Skip to content

Instantly share code, notes, and snippets.

@M4TTRX
Created March 16, 2021 16:51
Show Gist options
  • Save M4TTRX/630460cb8f6d4343da24de9ab8bd4f1c to your computer and use it in GitHub Desktop.
Save M4TTRX/630460cb8f6d4343da24de9ab8bd4f1c to your computer and use it in GitHub Desktop.
Gets lyrics for a given song and its artist using the Genius API
from lyricsgenius import Genius
import pandas as pd
import numpy as np
from progress.bar import Bar
def check_song(song):
    """Return True when a Genius search result looks like a real song.

    Args:
        song: The object returned by the Genius search, or ``None`` when
            nothing was found. Only its ``artist`` attribute is read.

    Returns:
        ``False`` for a missing result or for a result credited to the
        artist "Genius"; ``True`` otherwise.
    """
    # empty songs should be ignored
    if song is None:
        return False
    # If Genius itself is the artist, the hit is most likely a playlist or
    # editorial page rather than a song, so its "lyrics" would be invalid.
    return song.artist != "Genius"
def save_df(df):
    """Write the track IDs and their lyrics to ``data/output/lyrics.csv``.

    The ``artist_name`` and ``track_name`` columns are dropped, and any row
    that still contains a missing value (i.e. no lyrics found yet) is
    removed to keep the file small. The input frame is not modified.

    Args:
        df: DataFrame containing at least the ``artist_name`` and
            ``track_name`` columns alongside the data to keep.
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    output_path = Path("data/output/lyrics.csv")
    # Make sure the target directory exists; otherwise to_csv raises OSError
    # and a long scraping run would lose its backup.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # drop() returns a new frame, so the caller's data is left untouched
    data = df.drop(["artist_name", "track_name"], axis=1)
    # remove any row that has no lyrics to make the data smaller
    data = data.dropna()
    # save our data to a csv
    data.to_csv(output_path, index=False)
def main():
    """Look up lyrics on Genius for every track in the Spotify dataset.

    Reads ``data/input/SpotifyFeatures.csv``, searches Genius for each
    (track name, artist name) pair, stores the lyrics found in a new
    ``lyrics`` column and saves the result through ``save_df``. A backup is
    written periodically so a long run can be interrupted without losing
    everything.

    The Genius access token is taken from the ``GENIUS_ACCESS_TOKEN``
    environment variable; if it is unset or empty the user is prompted.
    """
    # Local import keeps this function self-contained.
    import os

    # import the music dataset
    data = pd.read_csv("data/input/SpotifyFeatures.csv")
    # .copy() gives an independent frame, so the column assignments below
    # do not write into a slice of the original.
    data = data[["track_id", "artist_name", "track_name"]].copy()

    # set the artists and song names in lower case for more accurate search
    data["artist_name"] = data["artist_name"].str.lower()
    data["track_name"] = data["track_name"].str.lower()

    # create the new lyrics column; object dtype so it can hold strings
    # while still using NaN as the "no lyrics" marker
    data["lyrics"] = pd.Series(np.nan, index=data.index, dtype=object)

    # Never commit the access token to source control: read it from the
    # environment and fall back to asking the user.
    token = os.environ.get("GENIUS_ACCESS_TOKEN", "")
    if token == "":
        token = input("Genius Access Token:")

    # Initiate the genius API
    genius = Genius(token, verbose=False)

    # create a progress bar
    progress_bar = Bar(
        "Processing",
        max=len(data),
    )

    failed_lookups = 0

    # get all lyrics
    for index, row in data.iterrows():
        try:
            song = genius.search_song(row.track_name, row.artist_name)
            if check_song(song):
                # remove the newline characters as they cause a mess in the csv
                song_lyrics = song.lyrics.replace("\n", "[new-line]")
                # .at writes straight into the frame; chained indexing
                # (data["lyrics"][index] = ...) may write to a temporary copy.
                data.at[index, "lyrics"] = song_lyrics
        except Exception:
            # Best effort: one failed lookup must not abort the whole run.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            failed_lookups += 1

        # backup the progress we did every now and then
        if index % 1000 == 500:
            print(
                f"\nCurrently scanned {index} items, creating a backup...\n\n"
            )
            save_df(data)
        progress_bar.next()

    # finish off
    save_df(data)
    progress_bar.finish()
    if failed_lookups:
        print(f"{failed_lookups} lookups failed and were skipped")
    # info() prints its report itself and returns None
    data.info()
# Start the scraper only when this file is executed as a script, so that
# importing the module does not trigger any work.
if __name__ == "__main__":
    main()
pandas
lyricsgenius
progress
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment