Skip to content

Instantly share code, notes, and snippets.

@jackschultz
Created November 23, 2016 17:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jackschultz/868ae6b7828acfa8760d83e582e59ccb to your computer and use it in GitHub Desktop.
Save jackschultz/868ae6b7828acfa8760d83e582e59ccb to your computer and use it in GitHub Desktop.
For each artist name in the artist_names list, this uses Genius' API and website to download song info and lyrics (as best can be done with HTML scraping) into named folders in the current directory. Requires a Genius API Bearer token.
import requests
from bs4 import BeautifulSoup
import os, json
base_url = "http://api.genius.com"
headers = {'Authorization': 'Bearer GENIUS_API_BEARER_STRING'}
artist_names = ["Fleet Foxes"]
def artist_id_from_song_api_path(song_api_path, artist_name):
    """Look up a song and return its primary artist's API path on a name match.

    Fetches ``base_url + song_api_path`` with the module-level ``headers`` and
    reads ``response.song.primary_artist`` from the JSON payload.

    Returns the artist's ``api_path`` when the primary artist's ``name`` is
    exactly equal to ``artist_name``; otherwise returns ``None``.
    """
    payload = requests.get(base_url + song_api_path, headers=headers).json()
    primary_artist = payload["response"]["song"]["primary_artist"]
    if primary_artist["name"] != artist_name:
        return None
    return primary_artist["api_path"]
def songs_from_artist_api_path(artist_api_path):
    """Return the API paths of all songs listed for an artist.

    Pages through ``base_url + artist_api_path + "/songs"`` until a page comes
    back with fewer entries than were requested.

    Args:
        artist_api_path: the artist's API path as returned by
            ``artist_id_from_song_api_path``.

    Returns:
        A list of unique song ``api_path`` strings, in the order they were
        first seen.
    """
    per_page = 50
    artist_url = base_url + artist_api_path + "/songs"
    api_paths = []
    seen = set()
    # Start at page 1 explicitly so the first page is not requested twice
    # (once without "page" and once with page=1).
    # NOTE(review): assumes Genius numbers pages from 1 - confirm in API docs.
    page = 1
    while True:
        # Paging options of a GET request belong in the query string, so
        # they are passed with params= rather than data= (the request body).
        query = {"per_page": per_page, "page": page}
        response = requests.get(artist_url, params=query, headers=headers)
        songs = response.json()["response"]["songs"]
        for song in songs:
            path = song["api_path"]
            if path not in seen:
                seen.add(path)
                api_paths.append(path)
        if len(songs) < per_page:
            break  # a short page means there are no more songs
        page += 1
    return api_paths
def info_from_song_api_path(song_api_path):
    """Fetch a song from the API and return the decoded JSON response.

    The request goes to ``base_url + song_api_path`` using the module-level
    ``headers``; the whole parsed payload is returned unchanged.
    """
    return requests.get(base_url + song_api_path, headers=headers).json()
def lyrics_from_song_web_path(song_web_path):
    """Scrape a song's lyrics from its genius.com web page.

    The page at ``"http://genius.com" + song_web_path`` is downloaded, every
    ``<script>`` element is removed, and the text of the first ``<lyrics>``
    element is passed through ``clean_lyrics``.

    Returns:
        The cleaned lyrics text, or an empty string when the page contains
        no ``<lyrics>`` element (instead of failing with AttributeError).
    """
    # The lyrics are not part of the API response, so scrape the HTML page.
    page_url = "http://genius.com" + song_web_path
    page = requests.get(page_url)
    html = BeautifulSoup(page.text, "html.parser")
    # Drop scripts so their source does not leak into the extracted text.
    for script in html('script'):
        script.extract()
    lyrics_tag = html.find("lyrics")
    if lyrics_tag is None:
        # Page layout differs or the request failed; report "no lyrics"
        # rather than aborting the caller.
        return u""
    return clean_lyrics(lyrics_tag.get_text())
def song_ids_already_scraped(artist_folder_path, force=False):
    """Return the song ids that already have a non-empty ``.txt`` file.

    The id is taken to be the part of the file name (without extension) that
    follows the last underscore, e.g. ``"Some Title_12345.txt"`` -> ``"12345"``.

    Args:
        artist_folder_path: directory to scan.
        force: when true, report nothing as scraped so everything is redone.

    Returns:
        A list of id strings. Empty files are ignored so they get re-scraped.
    """
    if force:
        return []
    song_ids = []
    for file_name in os.listdir(artist_folder_path):
        # splitext only looks at the final extension, so names without a dot
        # or titles that contain dots are handled correctly.
        stem, extension = os.path.splitext(file_name)
        if extension != '.txt':
            continue
        file_path = os.path.join(artist_folder_path, file_name)
        try:
            is_empty = os.path.getsize(file_path) == 0
        except OSError:
            # File vanished or is unreadable: treat it as not scraped.
            continue
        # Sometimes the file is empty; those must not count as scraped.
        if is_empty:
            continue
        song_ids.append(stem.split("_")[-1])
    return song_ids
def clean_lyrics(lyrics):
    """Replace typographic and accented characters in lyrics with ASCII.

    Each (source, replacement) pair in the table below is applied with
    ``str.replace``; characters not in the table are left untouched.
    Returns the converted text.
    """
    substitutions = (
        (u"\u2019", "'"),    # right single quotation mark
        (u"\u2018", "'"),    # left single quotation mark
        (u"\u02bc", "'"),    # modifier letter apostrophe
        (u"\xe9", "e"),      # e with acute accent
        (u"\xe8", "e"),      # e with grave accent
        (u"\xe0", "a"),      # a with grave accent
        (u"\u2026", "..."),  # horizontal ellipsis
        (u"\u2012", "-"),    # figure dash
        (u"\u2013", "-"),    # en dash
        (u"\u2014", "-"),    # em dash
        (u"\u201c", '"'),    # left double quotation mark
        (u"\u201d", '"'),    # right double quotation mark
        (u"\u200b", ' '),    # zero width space
        (u"\x92", "'"),      # C1 control char; presumably a mis-decoded quote
        (u"\x91", "'"),      # C1 control char; presumably a mis-decoded quote
        (u"\xf1", "n"),      # n with tilde
        (u"\xed", "i"),      # i with acute accent
        (u"\xe1", "a"),      # a with acute accent
        (u"\xea", "e"),      # e with circumflex
        (u"\xf3", "o"),      # o with acute accent
        (u"\xb4", ""),       # stand-alone acute accent, simply dropped
        (u"\xeb", "e"),      # e with diaeresis
        (u"\xe4", "a"),      # a with diaeresis
        (u"\xe7", "c"),      # c with cedilla
    )
    cleaned = lyrics
    for fancy, plain in substitutions:
        cleaned = cleaned.replace(fancy, plain)
    return cleaned
def _find_artist_api_path(artist_name):
    """Search for ``artist_name`` and return the artist's API path, or None.

    Each search hit is a song; the first hit whose primary artist name
    matches ``artist_name`` (per ``artist_id_from_song_api_path``) wins.
    """
    search_url = base_url + "/search"
    # The search term is a query-string parameter, hence params= not data=.
    response = requests.get(search_url, params={'q': artist_name}, headers=headers)
    for hit in response.json()["response"]["hits"]:
        artist_api_path = artist_id_from_song_api_path(hit["result"]["api_path"], artist_name)
        if artist_api_path:  # done searching once the artist is found
            return artist_api_path
    return None


def _scrape_artist(artist_name):
    """Download info and lyrics for every not-yet-scraped song of one artist.

    Files are written to ``artists/<artist_name>/info`` and
    ``artists/<artist_name>/lyrics`` as ``<title>_<song id>.txt``.
    """
    # Set up the artist's directories.
    artist_folder_path = "artists/%s" % artist_name
    artist_lyrics_path = "%s/lyrics" % artist_folder_path
    artist_info_path = "%s/info" % artist_folder_path
    for folder in (artist_lyrics_path, artist_info_path):
        if not os.path.exists(folder):
            os.makedirs(folder)  # also creates the artist folder itself

    # Only the lyrics folder is checked because lyrics are saved second.
    prev_song_ids = set(song_ids_already_scraped(artist_lyrics_path))

    artist_api_path = _find_artist_api_path(artist_name)
    if not artist_api_path:
        print("Could not find %s" % artist_name)
        return  # nothing to fetch without an artist path

    song_api_paths = songs_from_artist_api_path(artist_api_path)
    pending = []
    for song_api_path in song_api_paths:
        api_id = song_api_path.split('/')[2]
        if api_id not in prev_song_ids:  # don't redo
            pending.append((api_id, song_api_path))
    # Print how many songs are left to download.
    print(len(pending))

    for api_id, song_api_path in pending:
        full_song_info = info_from_song_api_path(song_api_path)
        song = full_song_info["response"]["song"]
        song_title_path = song["title"].replace('/', '_')
        lyrics = lyrics_from_song_web_path(song["path"])
        # The id suffix is what song_ids_already_scraped reads back on the
        # next run to decide which songs can be skipped.
        file_name = "%s_%s.txt" % (song_title_path, api_id)
        lyric_path = "%s/%s" % (artist_lyrics_path, file_name)
        info_path = "%s/%s" % (artist_info_path, file_name)
        print(lyric_path)
        with open(info_path, "w") as info_file:
            info_file.write(json.dumps(full_song_info))
        # Encode explicitly so characters clean_lyrics did not map to ASCII
        # are stored instead of raising UnicodeEncodeError.
        with open(lyric_path, "wb") as lyric_file:
            lyric_file.write(lyrics.encode("utf-8"))


if __name__ == "__main__":
    for artist_name in artist_names:
        _scrape_artist(artist_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment