# Searches lyrics: looks up an artist's songs and scrapes the lyrics for each one.
import os
import sys
import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv, find_dotenv
class GeniusLyricSearch:
    """Look up an artist's songs over HTTP and scrape each song's lyrics page.

    Every JSON request carries the client access token as the
    ``access_token`` query parameter.

    NOTE(review): the endpoint URLs in this class are blank or relative
    (``''``, ``"<artist_id>"``, ``"<artist_id>/songs"``) and the help URL
    printed by ``__init__`` is an empty string -- presumably a base URL was
    stripped from the file at some point. They are kept verbatim here;
    confirm the intended endpoints before relying on the network calls.
    """

    # Request headers shared by every JSON call.
    _JSON_HEADERS = {
        'accept': 'application/json, text/plain, */*',
    }

    def __init__(self, options=None):
        """Store the client access token.

        Args:
            options: dict that must contain ``"client_access_token"``.

        Exits the process with status 1 (after printing a hint) when the
        token is missing.
        """
        # A fresh dict per call instead of a shared mutable default argument.
        options = {} if options is None else options
        if "client_access_token" not in options:
            print("You Need To Get a Client Access Token from:")
            # NOTE(review): this line presumably printed a URL -- confirm.
            print("")
            sys.exit(1)
        self.client_access_token = options["client_access_token"]
        # While True, get_songs_from_artist_id() returns [] without doing
        # a request; enumerate_artist_songs() clears it for one batch run.
        self.batch_search_finished = True

    def _get_json(self, url, params):
        """GET ``url`` with the shared JSON headers and return the decoded body."""
        response = requests.get(url, headers=self._JSON_HEADERS, params=params)
        return response.json()

    def batch_process(self, options):
        """Run a function over every item of a list on a thread pool.

        Args:
            options: dict of the form
                ``{ max_workers: 10 , batch_list , function_reference: func_ref }``.
                ``batch_list`` and ``function_reference`` are required;
                ``max_workers`` is optional and falls back to the executor's
                own default when absent.

        Returns:
            List of results in the same order as ``batch_list``.
        """
        batch_list = options["batch_list"]
        with ThreadPoolExecutor(max_workers=options.get("max_workers")) as executor:
            results = executor.map(options["function_reference"], batch_list)
            # tqdm only wraps the result iterator to draw a progress bar.
            return list(tqdm(results, total=len(batch_list)))

    def search(self, search_term):
        """Query the search endpoint for ``search_term`` and return the JSON body."""
        params = (
            ('access_token', self.client_access_token),
            ('q', search_term),
        )
        return self._get_json('', params)

    def get_artist_info_from_id(self, artist_id):
        """Fetch the JSON record for ``artist_id`` (the request URL is printed first)."""
        params = (
            ('access_token', self.client_access_token),
        )
        url = f"{artist_id}"
        print(url)
        return self._get_json(url, params)

    def get_songs_from_artist_id(self, options=None):
        """Fetch one page of an artist's songs, sorted by popularity.

        Args:
            options: dict with ``"artist_id"`` (required) plus optional
                ``"per_page"`` (default 20) and ``"page"`` (default 1).
                The dict is not modified.

        Returns:
            The ``response.songs`` list of the JSON reply, or ``[]`` when the
            current batch is already marked finished or no artist id is given.
        """
        if self.batch_search_finished:
            return []
        options = {} if options is None else options
        if "artist_id" not in options:
            print("no artist id given")
            # Without an id there is nothing to request.
            return []
        params = (
            ('sort', 'popularity'),
            ('per_page', options.get("per_page", 20)),
            ('page', options.get("page", 1)),
            ('access_token', self.client_access_token),
        )
        result = self._get_json(f"{options['artist_id']}/songs", params)
        if result["response"]["next_page"] is None:
            # Last page reached: later calls in this batch short-circuit.
            self.batch_search_finished = True
        return result["response"]["songs"]

    def enumerate_artist_songs(self, artist_id):
        """Request song pages 1..99 for ``artist_id`` through batch_process().

        Returns:
            A list with one entry per requested page; each entry is that
            page's song list (``[]`` for pages skipped after the last page
            was seen).
        """
        self.batch_search_finished = False
        batch_options_list = [
            {"artist_id": artist_id, "per_page": 20, "page": page}
            for page in range(1, 100)
        ]
        try:
            return self.batch_process({
                "max_workers": 5,
                "function_reference": self.get_songs_from_artist_id,
                "batch_list": batch_options_list,
            })
        finally:
            # Restore the idle state even when a request raised.
            self.batch_search_finished = True

    def scrape_song_lyrics_from_url(self, song_instance, remove_section_headers=False):
        """Download a song page and attach its lyrics text to the song dict.

        Args:
            song_instance: dict with a ``"url"`` key; gains a ``"lyrics"``
                key on success.
            remove_section_headers: when True, strip bracketed headers such
                as [Verse] or [Bridge] and the gaps they leave behind.

        Returns:
            ``song_instance`` with ``"lyrics"`` set, or ``None`` when the page
            is a 404 or has no ``<div class="lyrics">`` element.
        """
        page = requests.get(song_instance["url"])
        if page.status_code == 404:
            return None
        html = BeautifulSoup(page.text, "html.parser")
        lyrics_div = html.find("div", class_="lyrics")
        if lyrics_div is None:
            return None  # Sometimes the lyrics section isn't found
        # Scrape lyrics if proper section was found on page
        lyrics = lyrics_div.get_text()
        if remove_section_headers:  # Remove [Verse], [Bridge], etc.
            lyrics = re.sub(r'(\[.*?\])*', '', lyrics)
            lyrics = re.sub(r'\n{2}', '\n', lyrics)  # Gaps between verses
        song_instance["lyrics"] = lyrics.strip("\n")
        return song_instance

    def get_all_songs_from_artist(self, artist_name):
        """Search for an artist, list their songs and scrape lyrics for each.

        The primary artist of the first search hit is taken as the artist.

        Returns:
            One entry per matching song, as returned by
            scrape_song_lyrics_from_url() -- so an entry is ``None`` when that
            song's lyrics could not be scraped. ``[]`` when the search has no
            hits.
        """
        search_results = self.search(artist_name)
        possible_artist_ids = [
            hit["result"]["primary_artist"]["id"]
            for hit in search_results["response"]["hits"]
        ]
        if not possible_artist_ids:
            print(f"No Search Results Found for Artist: {artist_name}")
            return []
        artist_id = possible_artist_ids[0]
        print(f"Gathering Songs from Artist ID: {artist_id}")
        song_pages = self.enumerate_artist_songs(artist_id)
        # Flatten List of Lists and
        # Filter Songs that For Whatever Reason Don't Match Artist Id
        artist_songs = [
            song
            for page in song_pages
            for song in page
            if song["primary_artist"]["id"] == artist_id
        ]
        # Now Batch Process the Lyric Scrapping of Each Song
        print(f"Scrapping Lyrics from {len(artist_songs)} Songs")
        return self.batch_process({
            "max_workers": 5,
            "function_reference": self.scrape_song_lyrics_from_url,
            "batch_list": artist_songs,
        })
if __name__ == "__main__":
    # Demo run: load environment variables via dotenv, then fetch and print
    # every Led Zeppelin song followed by the number of entries returned.
    load_dotenv(find_dotenv())
    access_token = os.environ["client_access_token"]
    lyric_searcher = GeniusLyricSearch({"client_access_token": access_token})
    artist_songs = lyric_searcher.get_all_songs_from_artist("Led Zeppelin")
    pprint(artist_songs)
    song_count = len(artist_songs)
    print(f"{song_count}")
