Created
November 23, 2016 17:16
-
-
Save jackschultz/868ae6b7828acfa8760d83e582e59ccb to your computer and use it in GitHub Desktop.
For each artist name in the artist_names list, this script uses the Genius API and website to download song info and lyrics (as best as can be done with HTML scraping) into named folders in the current directory. A Genius API Bearer token is required as well.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import json
import os

import requests
from bs4 import BeautifulSoup
# Root of the Genius REST API. HTTPS is used so that the bearer token in
# ``headers`` is never sent in clear text.
base_url = "https://api.genius.com"
# Authorization header sent with every API request; replace
# GENIUS_API_BEARER_STRING with a real Genius API bearer token.
headers = {'Authorization': 'Bearer GENIUS_API_BEARER_STRING'}
# Names of the artists whose songs should be downloaded.
artist_names = ["Fleet Foxes"]
def artist_id_from_song_api_path(song_api_path, artist_name):
    """Return the API path of a song's primary artist if its name matches.

    Fetches the song at ``base_url + song_api_path`` and compares the name of
    its primary artist with ``artist_name`` (exact, case-sensitive equality).

    Args:
        song_api_path: API path of the song; appended to ``base_url``.
        artist_name: Artist name to look for.

    Returns:
        The primary artist's ``api_path`` value when the names match,
        otherwise ``None``.
    """
    song_url = base_url + song_api_path
    response = requests.get(song_url, headers=headers)
    # Named ``payload`` rather than ``json`` so the imported json module is
    # not shadowed inside this function.
    payload = response.json()
    artist = payload["response"]["song"]["primary_artist"]
    if artist["name"] == artist_name:
        return artist["api_path"]
    return None
def songs_from_artist_api_path(artist_api_path):
    """Return the API paths of every song listed for an artist.

    Requests ``<artist_api_path>/songs`` page by page and stops at the first
    page that holds fewer than ``per_page`` songs.

    Args:
        artist_api_path: API path of the artist; appended to ``base_url``.

    Returns:
        A list of unique song API paths, in the order they were first seen.
    """
    per_page = 50
    artist_url = base_url + artist_api_path + "/songs"
    # Start at an explicit page 1 and advance by one per request, so no page
    # is requested twice.
    params = {"per_page": per_page, "page": 1}
    api_paths = []
    seen = set()
    while True:
        # ``params`` puts the arguments in the query string; ``data`` would
        # send them as a request body instead.
        response = requests.get(artist_url, params=params, headers=headers)
        songs = response.json()["response"]["songs"]
        for song in songs:
            path = song["api_path"]
            if path not in seen:
                seen.add(path)
                api_paths.append(path)
        if len(songs) < per_page:
            break  # a short page means there are no more songs
        params["page"] += 1
    return api_paths
def info_from_song_api_path(song_api_path):
    """Fetch a song from the API and return the decoded JSON response.

    Args:
        song_api_path: API path of the song; appended to ``base_url``.

    Returns:
        The whole decoded JSON document returned for the song.
    """
    song_url = base_url + song_api_path
    response = requests.get(song_url, headers=headers)
    # Returned directly: binding the result to a local called ``json`` would
    # shadow the imported json module.
    return response.json()
def lyrics_from_song_web_path(song_web_path):
    """Scrape a song's lyrics from its Genius web page.

    The lyrics are taken from the song's HTML page rather than from the API.

    Args:
        song_web_path: Path of the song page; appended to
            ``https://genius.com``.

    Returns:
        The text of the page's ``<lyrics>`` element passed through
        ``clean_lyrics``, or an empty string when the page has no such
        element.
    """
    page_url = "https://genius.com" + song_web_path
    page = requests.get(page_url)
    html = BeautifulSoup(page.text, "html.parser")
    # Drop <script> elements so their source does not end up in the text.
    for script in html('script'):
        script.extract()
    lyrics_tag = html.find("lyrics")
    if lyrics_tag is None:
        # No lyrics element on the page: report "nothing found" instead of
        # failing with AttributeError on None.
        return u""
    return clean_lyrics(lyrics_tag.get_text())
def song_ids_already_scraped(artist_folder_path, force=False):
    """List the song ids that already have a non-empty ``.txt`` file.

    The id is the part of the file name (extension removed) that follows the
    last underscore, e.g. ``"Some Title_12345.txt"`` gives ``"12345"``.
    Empty files are ignored so that those songs get scraped again.

    Args:
        artist_folder_path: Directory to inspect.
        force: When true, report nothing as scraped.

    Returns:
        A list of id strings; empty when ``force`` is true.
    """
    if force:
        return []
    song_ids = []
    for file_name in os.listdir(artist_folder_path):
        # splitext only looks at the final extension, so names without a dot
        # or with dots inside the title are handled correctly.
        stem, extension = os.path.splitext(file_name)
        if extension != '.txt':
            continue
        try:
            size = os.path.getsize(os.path.join(artist_folder_path, file_name))
        except OSError:
            # The file vanished or cannot be read; treat it as not scraped.
            continue
        if size != 0:
            song_ids.append(stem.split("_")[-1])
    return song_ids
# (character to replace, ASCII replacement) pairs applied by clean_lyrics.
_LYRIC_REPLACEMENTS = (
    (u"\u2019", "'"),    # right single quotation mark
    (u"\u2018", "'"),    # left single quotation mark
    (u"\u02bc", "'"),    # modifier letter apostrophe
    (u"\xe9", "e"),      # e with acute accent
    (u"\xe8", "e"),      # e with grave accent
    (u"\xe0", "a"),      # a with grave accent
    (u"\u2026", "..."),  # horizontal ellipsis
    (u"\u2012", "-"),    # figure dash
    (u"\u2013", "-"),    # en dash
    (u"\u2014", "-"),    # em dash
    (u"\u201c", '"'),    # left double quotation mark
    (u"\u201d", '"'),    # right double quotation mark
    (u"\u200b", ' '),    # zero width space
    (u"\x92", "'"),      # C1 control; presumably a mis-decoded cp1252 quote
    (u"\x91", "'"),      # C1 control; presumably a mis-decoded cp1252 quote
    (u"\xf1", "n"),      # n with tilde
    (u"\xed", "i"),      # i with acute accent
    (u"\xe1", "a"),      # a with acute accent
    (u"\xea", "e"),      # e with circumflex
    (u"\xf3", "o"),      # o with acute accent
    (u"\xb4", ""),       # stand-alone acute accent, removed
    (u"\xeb", "e"),      # e with diaeresis
    (u"\xe4", "a"),      # a with diaeresis
    (u"\xe7", "c"),      # c with cedilla
)


def clean_lyrics(lyrics):
    """Replace common typographic and accented characters with ASCII.

    Only the characters listed in ``_LYRIC_REPLACEMENTS`` are touched; any
    other character is left unchanged.

    Args:
        lyrics: Text to clean.

    Returns:
        The text with every listed character replaced.
    """
    for original, replacement in _LYRIC_REPLACEMENTS:
        lyrics = lyrics.replace(original, replacement)
    return lyrics
def _find_artist_api_path(artist_name):
    """Search the API for ``artist_name`` and return the artist's API path.

    Each search hit is a song; the first hit whose primary artist is named
    exactly ``artist_name`` decides the result.

    Returns:
        The artist's API path, or ``None`` when no hit matches (including
        when the search returns no hits at all).
    """
    search_url = base_url + "/search"
    # ``params`` puts the search term in the query string.
    response = requests.get(search_url, params={'q': artist_name},
                            headers=headers)
    for hit in response.json()["response"]["hits"]:
        artist_api_path = artist_id_from_song_api_path(
            hit["result"]["api_path"], artist_name)
        if artist_api_path:  # done searching once the artist is found
            return artist_api_path
    return None


def _scrape_artist(artist_name):
    """Download info and lyrics for every not-yet-scraped song of an artist.

    Files are written to ``artists/<artist_name>/info`` and
    ``artists/<artist_name>/lyrics`` as ``<title>_<song id>.txt``. The id
    suffix is what ``song_ids_already_scraped`` reads back, so a later run
    skips songs whose lyrics file is already present and non-empty.
    """
    # set up the paths to the artist's directories
    artist_folder_path = "artists/%s" % artist_name
    artist_lyrics_path = "%s/lyrics" % artist_folder_path
    artist_info_path = "%s/info" % artist_folder_path
    for folder in (artist_folder_path, artist_lyrics_path, artist_info_path):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # only the lyrics folder is checked, since lyrics are saved second
    prev_song_ids = set(song_ids_already_scraped(artist_lyrics_path))

    artist_api_path = _find_artist_api_path(artist_name)
    if not artist_api_path:
        print("Could not find %s" % artist_name)
        return  # nothing to download for an unknown artist

    song_api_paths = songs_from_artist_api_path(artist_api_path)
    # print out how many songs are left to download
    print(len(song_api_paths) - len(prev_song_ids))

    for song_api_path in song_api_paths:
        api_id = song_api_path.split('/')[2]
        if api_id in prev_song_ids:
            continue  # don't redo
        full_song_info = info_from_song_api_path(song_api_path)
        song = full_song_info["response"]["song"]
        # '/' would be read as a directory separator; the id suffix makes the
        # file recognisable on the next run.
        file_stem = "%s_%s" % (song["title"].replace('/', '_'), api_id)
        lyrics = lyrics_from_song_web_path(song["path"])
        lyric_path = "%s/lyrics/%s.txt" % (artist_folder_path, file_stem)
        info_path = "%s/info/%s.txt" % (artist_folder_path, file_stem)
        print(lyric_path)
        # json.dumps escapes non-ASCII by default, so a plain text file works.
        with open(info_path, "w") as info_file:
            info_file.write(json.dumps(full_song_info))
        # Lyrics may still contain characters clean_lyrics does not map, so
        # they are written as UTF-8 rather than with the default codec.
        with io.open(lyric_path, "w", encoding="utf-8") as lyric_file:
            try:
                lyric_file.write(lyrics)
            except UnicodeEncodeError as error:
                print(error)


if __name__ == "__main__":
    for artist_name in artist_names:
        _scrape_artist(artist_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment