Skip to content

Instantly share code, notes, and snippets.

@benrules2 benrules2/album_counter.py
Last active Aug 21, 2017

Embed
What would you like to do?
Main script file for counting duplicate sets of words in album lyrics
client_id = 'Your Genius Client ID'
client_secret = 'Your Genius Client Secret'
client_access_token = 'Your Genius Access Token'
import plot_albums
import string
import requests
import nltk
from sys import argv
from bs4 import BeautifulSoup
from nltk.util import ngrams
from collections import Counter
class ArtistQuery:
def __init__(self, token):
self.token = token
self.headers = {'Authorization': 'Bearer ' + str(token)}
self.base_url = "http://api.genius.com"
def __get_artist_id__(self, artist_name, song_name=""):
data = {'q': artist_name + ' ' + song_name}
response = requests.get(self.base_url + "/search", params=data, headers=self.headers).json()
print("found artist " + response['response']['hits'][0]['result']['primary_artist']['name'])
return response['response']['hits'][0]['result']['primary_artist']['id']
def get_songs_by_artist(self, artist_name, song=""):
songs = []
page = 1
id = self.__get_artist_id__(artist_name, song)
last_page = False
while not last_page:
data = {'per_page': '50', 'page': str(page)}
response = requests.get(self.base_url + "/artists/" + str(id) + '/songs', params=data,
headers=self.headers).json()
for song in response['response']['songs']:
songs.append(str(song['api_path']))
if response['response']['next_page']:
page = response['response']['next_page']
else:
last_page = True
return songs
def build_albums(self, songs):
albums = {}
for song in songs:
response = requests.get(self.base_url + song, headers=self.headers).json()
if response['response']['song']['album']:
albums.setdefault(response['response']['song']['album']['name'], []).append(
response["response"]["song"]["path"])
return albums
def build_album_corpus(self, album):
album_lyrics = ""
for song in album:
page = requests.get("http://genius.com" + song)
html = BeautifulSoup(page.text, "html.parser")
[h.extract() for h in html('script')]
lyrics = html.find("div", class_="lyrics").get_text() # updated css where the lyrics are based in HTML
lyrics = lyrics.replace('\n', ' ')
lyrics = lyrics.replace('\'', '')
for c in string.punctuation:
lyrics = lyrics.replace(c, "")
album_lyrics += lyrics.lower()
return album_lyrics
def get_album_csv(self, artist_name, filename=None):
songs = artist.get_songs_by_artist(artist_name)
albums = artist.build_albums(songs)
if (filename):
file = open(filename, "w")
file.write("album, word count, ngrams: 2,3,4")
albums_list = []
for album in albums:
if len(albums[album]) < 3:
continue
lyrics = artist.build_album_corpus(albums[album])
token = nltk.word_tokenize(lyrics)
album_name = album.replace(",",'')
if len(album_name) > 18:
album_name = album_name[:15] + '...'
csv_line = album_name + ','
csv_line += str(len(token)) + ','
try:
for i in range(2, 5):
count = 0
counter = Counter(ngrams(token, i))
for key in counter:
count += counter[key] - 1
csv_line += str(count) + ','
print(csv_line)
albums_list.append(csv_line.split(","))
if (filename):
file.write(csv_line + '\n')
except:
pass
return albums_list
if __name__ == '__main__':
artist_name = argv[1]
artist = ArtistQuery(client_access_token)
albums_list = artist.get_album_csv(artist_name, artist_name + '.csv')
plot_albums.plot_album_ngrams(albums_list, artist_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.