0187773933/LyricsSearch.py

## LyricsSearch.py
import os
import sys
import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

from dotenv import load_dotenv, find_dotenv

# http://genius.com/api-clients
# https://docs.genius.com/#/getting-started-h1
# https://github.com/johnwmillr/LyricsGenius

class GeniusLyricSearch:
	"""Client for the Genius API that gathers an artist's songs and scrapes their lyrics.

	Song metadata comes from the JSON API at api.genius.com; lyrics are scraped
	from the HTML page referenced by each song's "url" field.
	"""

	_API_ROOT = "https://api.genius.com"
	_HEADERS = {
		'accept': 'application/json, text/plain, */*',
	}

	def __init__( self , options=None ):
		"""Store the API token found in options["client_access_token"].

		If the token is missing, a hint on where to obtain one is printed and
		the process exits with status 1.
		"""
		# None instead of a shared mutable {} default.
		options = options or {}
		if "client_access_token" not in options:
			print( "You Need To Get a Client Access Token from:" )
			print( "http://genius.com/api-clients" )
			sys.exit( 1 )
		self.client_access_token = options["client_access_token"]
		# While True, get_songs_from_artist_id() short-circuits without any request.
		self.batch_search_finished = True

	def _get_json( self , url , extra_params=() ):
		"""GET url with the access token added to the query and return the decoded JSON."""
		params = tuple( extra_params ) + ( ( 'access_token' , self.client_access_token ) , )
		response = requests.get( url , headers=self._HEADERS , params=params )
		return response.json()

	# options = { max_workers: 10 , batch_list , function_reference: func_ref }
	def batch_process( self , options ):
		"""Apply options["function_reference"] to every item of options["batch_list"] on a thread pool.

		options["max_workers"] is optional; when absent the executor picks its
		own default pool size. A tqdm progress bar is shown while results are
		collected.

		Returns:
			A list with one result per batch item, in the order of batch_list.
		"""
		batch_list = options[ "batch_list" ]
		# The pool size was previously accepted in options but never forwarded.
		with ThreadPoolExecutor( max_workers=options.get( "max_workers" ) ) as executor:
			results = executor.map( options[ "function_reference" ] , batch_list )
			return list( tqdm( results , total=len( batch_list ) ) )

	def search( self , search_term ):
		"""Query the /search endpoint for search_term and return the decoded JSON response."""
		return self._get_json( f"{self._API_ROOT}/search" , ( ( 'q' , search_term ) , ) )

	def get_artist_info_from_id( self , artist_id ):
		"""Fetch /artists/<artist_id> and return the decoded JSON response.

		The request URL is printed before the request is made.
		"""
		url = f"{self._API_ROOT}/artists/{artist_id}"
		print( url )
		return self._get_json( url )

	def get_songs_from_artist_id( self , options=None ):
		"""Fetch one page of an artist's songs, sorted by popularity.

		Args:
			options: dict with "artist_id" (required), "per_page" (default 20)
				and "page" (default 1). The dict is not modified.

		Returns:
			The list under response["songs"]. An empty list is returned when
			the current batch search is already marked finished, or when no
			"artist_id" was supplied.
		"""
		if self.batch_search_finished:
			return []
		options = options or {}
		if "artist_id" not in options:
			print( "no artist id given" )
			# Always hand back a list so callers can flatten the per-page results.
			return []

		params = (
			( 'sort' , 'popularity' ) ,
			( 'per_page' , options.get( "per_page" , 20 ) ) ,
			( 'page' , options.get( "page" , 1 ) ) ,
		)
		url = f"{self._API_ROOT}/artists/{options['artist_id']}/songs"
		result = self._get_json( url , params )
		# No next page: flag the search as done so later page requests are skipped.
		if result["response"]["next_page"] is None:
			self.batch_search_finished = True
		return result["response"]["songs"]

	def enumerate_artist_songs( self , artist_id , max_pages=99 , per_page=20 ):
		"""Request pages 1..max_pages of an artist's songs through batch_process().

		Args:
			artist_id: artist id used in the /artists/<id>/songs URL.
			max_pages: highest page number to request (default 99).
			per_page: page size sent with every request (default 20).

		Returns:
			A list with one list of songs per requested page; pages that are
			skipped after the last page was seen contribute empty lists.
		"""
		page_requests = [
			{ "artist_id": artist_id , "per_page": per_page , "page": page_number }
			for page_number in range( 1 , max_pages + 1 )
		]
		self.batch_search_finished = False
		try:
			return self.batch_process({
					"max_workers": 5 ,
					"function_reference": self.get_songs_from_artist_id ,
					"batch_list": page_requests
				})
		finally:
			# Restore the idle state even if one of the page requests raised.
			self.batch_search_finished = True

	# https://github.com/johnwmillr/LyricsGenius/blob/a65c0fb7b2a2c7f35fe004d390c2ea2253265c0f/lyricsgenius/api.py#L159
	def scrape_song_lyrics_from_url( self , song_instance , remove_section_headers=False ):
		"""Download song_instance["url"] and store the scraped text under song_instance["lyrics"].

		Args:
			song_instance: song dict containing a "url" key; it is updated in place.
			remove_section_headers: when True, strip bracketed headers such as
				[Verse] or [Bridge] and collapse the gaps they leave.

		Returns:
			The same song_instance with "lyrics" set, or None when the page
			answers 404 or contains no <div class="lyrics"> element.
		"""
		page = requests.get( song_instance["url"] )
		if page.status_code == 404:
			return None

		html = BeautifulSoup( page.text , "html.parser" )
		div = html.find( "div" , class_="lyrics" )
		if not div:
			return None # Sometimes the lyrics section isn't found

		# Scrape lyrics if proper section was found on page
		lyrics = div.get_text()
		if remove_section_headers:  # Remove [Verse], [Bridge], etc.
			# Raw strings: "\[" is not a valid escape in a normal string literal.
			lyrics = re.sub( r'(\[.*?\])*' , '' , lyrics )
			lyrics = re.sub( r'\n{2}' , '\n' , lyrics )  # Gaps between verses
		song_instance["lyrics"] = lyrics.strip( "\n" )
		return song_instance

	def get_all_songs_from_artist( self , artist_name ):
		"""Search for artist_name, list the matching artist's songs and scrape lyrics for each.

		The artist is taken from the primary artist of the first search hit.

		Returns:
			One entry per song, as produced by scrape_song_lyrics_from_url()
			(so an entry is None when that song's lyrics could not be scraped).
			An empty list is returned when the search yields no hits.
		"""
		search_results = self.search( artist_name )
		hits = search_results[ "response" ][ "hits" ]
		if not hits:
			# Nothing to index into; previously this surfaced as an IndexError.
			print( f"No Search Results for Artist: {artist_name}" )
			return []

		artist_id = hits[ 0 ][ "result" ][ "primary_artist" ][ "id" ]
		print( f"Gathering Songs from Artist ID: {artist_id}" )
		song_pages = self.enumerate_artist_songs( artist_id )

		# Flatten the per-page lists and drop songs whose primary artist is someone else.
		artist_songs = [
			song
			for song_page in song_pages
			for song in ( song_page or [] )
			if song[ "primary_artist" ][ "id" ] == artist_id
		]

		# Now Batch Process the Lyric Scrapping of Each Song
		print( f"Scrapping Lyrics from {len( artist_songs )} Songs" )
		return self.batch_process({
				"max_workers": 5 ,
				"function_reference": self.scrape_song_lyrics_from_url ,
				"batch_list": artist_songs
			})


if __name__ == "__main__":

	# Example run: collect every Led Zeppelin song with lyrics and dump the result.
	# NOTE: an indentation-stripped duplicate of this whole module used to follow
	# the statements below and made the file unparseable; it has been removed.

	# Load variables from a .env file (if one is found) into the environment.
	load_dotenv( find_dotenv() )

	# Forward the token only when it is set, so a missing variable reaches the
	# class's own "get a token" guidance instead of a bare KeyError here.
	searcher_options = {}
	if "client_access_token" in os.environ:
		searcher_options[ "client_access_token" ] = os.environ[ "client_access_token" ]

	lyric_searcher = GeniusLyricSearch( searcher_options )
	artist_songs = lyric_searcher.get_all_songs_from_artist( "Led Zeppelin" )
	pprint( artist_songs )
	print( str( len( artist_songs ) ) )