Created
March 16, 2021 16:51
-
-
Save M4TTRX/630460cb8f6d4343da24de9ab8bd4f1c to your computer and use it in GitHub Desktop.
Gets lyrics for a given song and its artist using the Genius API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import numpy as np
import pandas as pd
from lyricsgenius import Genius
from progress.bar import Bar
def check_song(song):
    """Return True when ``song`` is a usable Genius search result.

    Args:
        song: The object returned by the Genius search, or ``None`` when
            nothing was found. Only its ``artist`` attribute is read.

    Returns:
        ``False`` for a missing result or for a result credited to the
        artist "Genius"; ``True`` otherwise.
    """
    # empty songs should be ignored
    if song is None:
        return False
    # If Genius is the artist, the hit is likely some playlist page, which
    # will have invalid lyrics -- so reject exactly those results. (The
    # previous `!=` comparison rejected every real artist instead.)
    return song.artist != "Genius"
def save_df(df):
    """Write the track IDs and their lyrics to ``data/output/lyrics.csv``.

    The ``artist_name`` and ``track_name`` columns are dropped and any row
    containing a missing value is removed before writing. The frame passed
    in is left unmodified.

    Args:
        df: DataFrame holding at least the ``artist_name`` and
            ``track_name`` columns alongside the columns to be saved.
    """
    output_path = "data/output/lyrics.csv"
    # only keep the track ID and their lyrics; drop() returns a new frame,
    # so the caller's DataFrame is untouched
    data = df.drop(["artist_name", "track_name"], axis=1)
    # remove any row that has no lyrics to make the data smaller
    data = data.dropna()
    # make sure the output folder exists so a long scraping run is not lost
    # to a missing directory when the first backup is written
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # save our data to a csv
    data.to_csv(output_path, index=False)
def main():
    """Fetch lyrics from the Genius API for every track in the dataset.

    Reads ``data/input/SpotifyFeatures.csv``, searches Genius for each
    (track name, artist name) pair, stores the lyrics of every result
    accepted by ``check_song`` and saves the outcome through ``save_df``.
    A backup is written periodically while the loop runs.

    The access token is read from the ``GENIUS_ACCESS_TOKEN`` environment
    variable; when it is unset or empty the user is prompted for it.
    """
    # import the music dataset; .copy() so that the column assignments below
    # act on an independent frame rather than on a slice of the original
    data = pd.read_csv("data/input/SpotifyFeatures.csv")
    data = data[["track_id", "artist_name", "track_name"]].copy()

    # set the artists and song names in lower case for more accurate search
    data["artist_name"] = data["artist_name"].str.lower()
    data["track_name"] = data["track_name"].str.lower()

    # create the new lyrics column; object dtype because it will hold strings
    data["lyrics"] = pd.Series(np.nan, index=data.index, dtype=object)

    # SECURITY: never hard-code a real access token in the source; take it
    # from the environment and fall back to asking the user.
    token = os.environ.get("GENIUS_ACCESS_TOKEN", "")
    if token == "":
        token = input("Genius Access Token:")

    # Initiate the genius API
    genius = Genius(token, verbose=False)

    # create a progress bar
    progress_bar = Bar(
        "Processing",
        max=len(data),
    )

    # get all lyrics
    failed_lookups = 0
    for index, row in data.iterrows():
        try:
            song = genius.search_song(row.track_name, row.artist_name)
            if check_song(song):
                # remove the newline characters as they cause a mess in the csv
                song_lyrics = song.lyrics.replace("\n", "[new-line]")
                # .at writes straight into the frame; chained indexing
                # (data["lyrics"][index] = ...) may write to a temporary copy
                data.at[index, "lyrics"] = song_lyrics
        except Exception:
            # Best effort: one failed lookup must not abort the whole run.
            # Catching Exception (not a bare except) keeps Ctrl+C working.
            failed_lookups += 1

        # backup the progress we did every now and then
        if index % 1000 == 500:
            print(
                f"\nCurrently scanned {index} items, creating a backup...\n\n"
            )
            save_df(data)
        progress_bar.next()

    # finish off
    save_df(data)
    progress_bar.finish()
    if failed_lookups:
        print(f"{failed_lookups} lookups failed and were skipped")
    # info() prints its report itself and returns None
    data.info()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pandas | |
lyricsgenius | |
progress |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment