Created
May 27, 2019 01:04
-
-
Save mencarellic/326da8f7f52f9d2f2ba28fd57fc1ffd1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import re | |
import nltk | |
from sklearn.feature_extraction.text import CountVectorizer | |
from collections import Counter | |
# Show all columns and do not truncate cell contents when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# None (not -1) disables truncation; -1 was deprecated in pandas 1.0 and
# rejected outright in pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Fix the global NumPy random seed so any sampling below is reproducible.
seed = 0
np.random.seed(seed)
# Function to find n grams given a list of text | |
def find_ngrams(input_list, n):
    """Return every contiguous n-gram of input_list as a list of tuples.

    A list shorter than n yields an empty list.
    """
    # zip over n progressively-offset views of the list; zip stops at the
    # shortest view, which is exactly where the last full n-gram ends.
    shifted_views = (input_list[offset:] for offset in range(n))
    return list(zip(*shifted_views))
# Function to round an integer formatted year to the decade | |
def round_down(num): | |
try: | |
return num - (num % 10) | |
except: | |
return num | |
# Function that takes a stopwords list, min frequency, and a list of sentences and returns a counter object | |
def word_freq(stopwords, min_df, data): | |
vectorizer = CountVectorizer(min_df=min_df, stop_words=stopwords) | |
fitted = vectorizer.fit_transform(data) | |
vocab = list(vectorizer.get_feature_names()) | |
counts = fitted.sum(axis=0).A1 | |
return Counter(dict(zip(vocab, counts))) | |
# Pre-compiled regexes used to normalise lyric text.
# Raw strings avoid the invalid '\s' escape in a plain string literal
# (a DeprecationWarning today, a SyntaxWarning/error in newer Pythons).
alpha_regex = re.compile(r"[^a-zA-Z\s']")  # anything but letters, whitespace, apostrophe
apost_regex = re.compile(r"[']")           # apostrophes (deleted, not spaced)
space_regex = re.compile(r' {2,}')         # runs of two or more spaces
# Stopword list: NLTK's English stopwords with apostrophes stripped, so they
# match the cleaned lyrics (apostrophe removal happens during text cleanup).
base_stop = [stopword.replace("'", '') for stopword in nltk.corpus.stopwords.words('english')]
# Cleaning can leave empty tokens behind; treat the empty string as a stopword too.
extra_stop = ['']
all_stop = set(base_stop).union(extra_stop)
# Tokens of this length or shorter are discarded during stopword removal.
minwordlen = 2
# Location of datafile (hard-coded Windows path; adjust per machine)
datafile = 'C:\\Git\\IST652\\Project\\data\\combined.csv'
# Read in CSV — assumes columns include 'Column1', 'combined', 'year',
# 'text', 'artist', 'genre' (TODO confirm against the data file).
df = pd.read_csv(datafile)
# drop the mongodb id column and the redundant combined column
df.drop(axis=1, columns=['Column1', 'combined'], inplace=True)
# get decade of song — round_down is vectorized over the Series via - and %
df['decade'] = round_down(df['year'])
# remove new line characters (\r \n), replacing each with a space
df.replace(r'(\r|\n|\r\n)', ' ', regex=True, inplace=True)
# make every string column lowercase; numeric columns have no .str accessor
for col in df.columns.values:
    try:
        df[col] = df[col].str.lower()
    except AttributeError:
        # Non-string column — deliberately skip it. Narrowed from a bare
        # except so genuine errors still surface.
        pass
# Get a list of all lyrics, cleaned: strip non-letters, drop apostrophes,
# collapse repeated spaces
tmp_all_lyrics = df['text'].tolist()
all_lyrics = [space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', song))) for song in tmp_all_lyrics]
# Corpus-wide word frequencies (words appearing in at least 5 songs)
all_lyrics_freq = word_freq(stopwords=all_stop, min_df=5, data=all_lyrics)
all_lyrics_freq = word_freq(stopwords=all_stop, min_df=5, data=all_lyrics) | |
# Get list of all artists
artists = df['artist'].unique()
artist_lyrics = pd.DataFrame(columns=['artist', 'lyrics', 'nostop'])
# iterate over artists and build one row per artist: raw cleaned lyrics plus
# a stopword-free version
for artist in artists:
    artist_index = len(artist_lyrics.index)
    lyrics = df[df['artist'] == artist]['text'].tolist()
    # Join with a space so the last word of one song cannot fuse with the
    # first word of the next; space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    artist_lyrics.loc[artist_index] = [artist, cleaned, no_stop]
# Generate tokens, bigrams, and per-artist top-25 word frequencies
artist_lyrics['tokens'] = artist_lyrics['nostop'].map(lambda x: x.split(' '))
artist_lyrics['bigrams'] = artist_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
artist_lyrics['top25'] = artist_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Get list of all genres
genres = df['genre'].unique()
genre_lyrics = pd.DataFrame(columns=['genre', 'lyrics', 'nostop'])
# iterate over genres and build one row per genre: raw cleaned lyrics plus
# a stopword-free version
for genre in genres:
    genre_index = len(genre_lyrics.index)
    lyrics = df[df['genre'] == genre]['text'].tolist()
    # Join with a space so words at song boundaries do not fuse together;
    # space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    genre_lyrics.loc[genre_index] = [genre, cleaned, no_stop]
# Generate tokens, bigrams, and per-genre top-25 word frequencies
genre_lyrics['tokens'] = genre_lyrics['nostop'].map(lambda x: x.split(' '))
genre_lyrics['bigrams'] = genre_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
genre_lyrics['top25'] = genre_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Get list of all decades
decades = df['decade'].unique()
decade_lyrics = pd.DataFrame(columns=['decade', 'lyrics', 'nostop'])
# iterate over decades and build one row per decade: raw cleaned lyrics plus
# a stopword-free version
for decade in decades:
    decade_index = len(decade_lyrics.index)
    lyrics = df[df['decade'] == decade]['text'].tolist()
    # Join with a space so words at song boundaries do not fuse together;
    # space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    decade_lyrics.loc[decade_index] = [decade, cleaned, no_stop]
# Generate tokens, bigrams, and per-decade top-25 word frequencies
decade_lyrics['tokens'] = decade_lyrics['nostop'].map(lambda x: x.split(' '))
decade_lyrics['bigrams'] = decade_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
decade_lyrics['top25'] = decade_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Export each aggregated table to a tab-separated file (tab avoids clashing
# with commas inside lyric text).
# NOTE(review): output paths are hard-coded Windows paths, matching the input
# path above — consider making them configurable.
artist_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\artists.csv', sep='\t', index=False)
genre_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\genre.csv', sep='\t', index=False)
decade_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\decade.csv', sep='\t', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment