Skip to content

Instantly share code, notes, and snippets.

@0187773933
Last active May 15, 2020 07:00
Show Gist options
  • Save 0187773933/ce87fcd05b07e8f3f04d8db04dcef1a5 to your computer and use it in GitHub Desktop.
Save 0187773933/ce87fcd05b07e8f3f04d8db04dcef1a5 to your computer and use it in GitHub Desktop.
Searches Genius.com Lyrics
import os
import sys
import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv, find_dotenv
# http://genius.com/api-clients
# https://docs.genius.com/#/getting-started-h1
# https://github.com/johnwmillr/LyricsGenius
class GeniusLyricSearch:
    """Client for the Genius.com API that can also scrape lyrics from song pages.

    API requests are authenticated with a client access token
    (see http://genius.com/api-clients), sent as the ``access_token``
    query parameter.
    """

    def __init__( self , options=None ):
        """Store the client access token.

        Args:
            options: dict that must contain "client_access_token".
                If the key is missing, a hint is printed and the process
                exits with status 1.
        """
        # A None default avoids sharing one mutable dict between calls.
        options = options or {}
        if "client_access_token" not in options:
            print( "You Need To Get a Client Access Token from:" )
            print( "http://genius.com/api-clients" )
            sys.exit( 1 )
        self.client_access_token = options[ "client_access_token" ]
        # Shared flag read by get_songs_from_artist_id(); True means
        # "stop requesting further pages".
        self.batch_search_finished = True

    def _api_get( self , url , params=() ):
        """GET `url` with the access token appended to `params`; return the decoded JSON."""
        headers = {
            'accept': 'application/json, text/plain, */*',
        }
        query = tuple( params ) + ( ( 'access_token' , self.client_access_token ) , )
        response = requests.get( url , headers=headers , params=query )
        return response.json()

    def batch_process( self , options ):
        """Map a function over a list using a thread pool, showing a tqdm progress bar.

        Args:
            options: dict with the keys
                "batch_list": list of items to process,
                "function_reference": callable applied to each item,
                "max_workers" (optional): thread pool size; when absent the
                    ThreadPoolExecutor default is used.

        Returns:
            List of results, in the same order as "batch_list".
        """
        batch_list = options[ "batch_list" ]
        # Honour the caller's max_workers; previously the option was ignored.
        with ThreadPoolExecutor( max_workers=options.get( "max_workers" ) ) as executor:
            results = executor.map( options[ "function_reference" ] , batch_list )
            return list( tqdm( results , total=len( batch_list ) ) )

    def search( self , search_term ):
        """Query https://api.genius.com/search for `search_term`; return the decoded JSON."""
        return self._api_get( 'https://api.genius.com/search' , ( ( 'q' , search_term ) , ) )

    def get_artist_info_from_id( self , artist_id ):
        """Fetch https://api.genius.com/artists/<artist_id>; return the decoded JSON.

        The requested URL is printed before the request is made.
        """
        url = f"https://api.genius.com/artists/{artist_id}"
        print( url )
        return self._api_get( url )

    def get_songs_from_artist_id( self , options=None ):
        """Fetch one page of an artist's songs, sorted by popularity.

        Args:
            options: dict with "artist_id" (required), "per_page"
                (default 20) and "page" (default 1). The dict is not modified.

        Returns:
            The list under response -> songs of the API reply, or an empty
            list when `self.batch_search_finished` is already set or no
            artist id was given.

        Side effects:
            Sets `self.batch_search_finished` to True when the reply
            reports no next page.
        """
        if self.batch_search_finished:
            return []
        options = options or {}
        if "artist_id" not in options:
            print( "no artist id given" )
            # Always return a list so callers can flatten the results.
            return []
        params = (
            ( 'sort' , 'popularity' ) ,
            ( 'per_page' , options.get( "per_page" , 20 ) ) ,
            ( 'page' , options.get( "page" , 1 ) ) ,
        )
        url = f"https://api.genius.com/artists/{options[ 'artist_id' ]}/songs"
        result = self._api_get( url , params )
        if result[ "response" ][ "next_page" ] is None:
            self.batch_search_finished = True
        return result[ "response" ][ "songs" ]

    def enumerate_artist_songs( self , artist_id ):
        """Request song pages 1..99 (20 songs per page) for `artist_id` in a thread pool.

        Returns:
            A list with one entry per requested page; each entry is that
            page's list of songs (empty for pages skipped after the last
            page was detected).
        """
        self.batch_search_finished = False
        batch_options_list = [ { "artist_id": artist_id , "per_page": 20 , "page": x } for x in range( 1 , 100 ) ]
        try:
            return self.batch_process({
                "max_workers": 5 ,
                "function_reference": self.get_songs_from_artist_id ,
                "batch_list": batch_options_list
            })
        finally:
            # Reset the flag even if a request raised.
            self.batch_search_finished = True

    # https://github.com/johnwmillr/LyricsGenius/blob/a65c0fb7b2a2c7f35fe004d390c2ea2253265c0f/lyricsgenius/api.py#L159
    def scrape_song_lyrics_from_url( self , song_instance , remove_section_headers=False ):
        """Download the page at song_instance["url"] and attach its lyrics text.

        Args:
            song_instance: dict with a "url" key; gains a "lyrics" key on success.
            remove_section_headers: when True, strip bracketed headers such
                as [Verse] or [Bridge] and collapse double newlines.

        Returns:
            The same `song_instance` dict with "lyrics" set, or None when the
            page returns 404 or has no <div class="lyrics"> element.
        """
        page = requests.get( song_instance[ "url" ] )
        if page.status_code == 404:
            return None
        html = BeautifulSoup( page.text , "html.parser" )
        div = html.find( "div" , class_="lyrics" )
        if not div:
            return None # Sometimes the lyrics section isn't found
        # Scrape lyrics if proper section was found on page
        lyrics = div.get_text()
        if remove_section_headers: # Remove [Verse], [Bridge], etc.
            lyrics = re.sub( r'(\[.*?\])*' , '' , lyrics )
            lyrics = re.sub( r'\n{2}' , '\n' , lyrics ) # Gaps between verses
        song_instance[ "lyrics" ] = lyrics.strip( "\n" )
        return song_instance

    def get_all_songs_from_artist( self , artist_name ):
        """Search for `artist_name`, list the first matching artist's songs and scrape their lyrics.

        The artist id is taken from the primary artist of the first search hit.

        Returns:
            List with one entry per song: the song dict with "lyrics" added,
            or None where scraping failed. An empty list is returned when the
            search yields no hits.
        """
        search_results = self.search( artist_name )
        hits = search_results[ "response" ][ "hits" ]
        if not hits:
            print( f"No Search Results for: {artist_name}" )
            return []
        artist_id = hits[ 0 ][ "result" ][ "primary_artist" ][ "id" ]
        print( f"Gathering Songs from Artist ID: {artist_id}" )
        song_pages = self.enumerate_artist_songs( artist_id )
        # Flatten the list of pages, keeping only songs whose primary artist
        # matches the chosen artist id.
        artist_songs = [
            song
            for page in song_pages
            for song in ( page or [] )
            if song[ "primary_artist" ][ "id" ] == artist_id
        ]
        # Now Batch Process the Lyric Scrapping of Each Song
        print( f"Scrapping Lyrics from {len( artist_songs )} Songs" )
        return self.batch_process({
            "max_workers": 5 ,
            "function_reference": self.scrape_song_lyrics_from_url ,
            "batch_list": artist_songs
        })
if __name__ == "__main__":
    # Example run: load the token from a .env file / the environment, then
    # fetch and print every Led Zeppelin song with its scraped lyrics.
    load_dotenv( find_dotenv() )
    searcher_options = {}
    access_token = os.environ.get( "client_access_token" )
    if access_token:
        searcher_options[ "client_access_token" ] = access_token
    # With no token set, the constructor prints where to get one and exits,
    # instead of this script dying with a bare KeyError.
    lyric_searcher = GeniusLyricSearch( searcher_options )
    artist_songs = lyric_searcher.get_all_songs_from_artist( "Led Zeppelin" )
    pprint( artist_songs )
    print( len( artist_songs ) )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment