Skip to content

Instantly share code, notes, and snippets.

@dfreelon
Created July 1, 2023 14:02
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
The code I used to create the #BlackMusicMonthChallenge top tracks playlists for Spotify and YouTube. The code requires valid Spotify and YouTube API credentials.
# -*- coding: utf-8 -*-
# run the following two snscrape commands from a CLI first -- you'll need the dev version from https://github.com/JustAnotherArchivist/snscrape
# future replicators: I recommend using Twitter's "since" and "until" date operators. I used snscrape's built-in "since" operator because I ran it on June 30, so I didn't need "until"
#for replies: snscrape --since 2023-05-31 --jsonl twitter-search @naima >@naima_replies_all.json
#for QTs: snscrape --since 2023-05-31 --jsonl twitter-search quoted_user_id:78417631 >@naima_qts_all.json
import html
import json
import pandas as pd
from pyyoutube import Api
import re
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# --- API credentials (fill these in before running) ---
spotify_client_id = "" #you'll need proper Spotify and YT creds, obv
spotify_client_secret = ""
yt_api_key = ''

# --- URL/title regexes ---
# Raw strings: the originals used '\?' and '\"' inside plain strings, which
# are invalid escape sequences (SyntaxWarning on Python 3.12+). Values are
# unchanged except 'youtu.be' now escapes the dot so '.' doesn't match any
# character.
# capture the video ID from a long-form youtube.com/watch?v=... URL
yt_long_regex = r'(?:v=)(.+?)(?:&|$|\?)'
# capture the video ID from a short youtu.be/... URL
yt_short_regex = r'(?:youtu\.be/)(.+?)(?:&|$|/|\?)'
# match everything from the first "unexpected" character onward (used to
# strip remix/remaster/etc. suffixes from titles)
yt_extra_regex = r'[^A-Za-z0-9-\':\",.& ].+'
# zero-width split point between a lowercase and an uppercase letter,
# used to re-space CamelCased VEVO channel names
vevo_user_regex = r'(?<=[a-z])(?:)(?=[A-Z])'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)
def prep_field(field):
trimmed = re.sub('[^A-Z0-9_ ]','',field.upper())
trimmed = trimmed.strip().replace(' ','_')
return re.sub('THE_|AND_','',trimmed)
# Load the scraped replies and quote-tweets (one JSON object per line).
# Context managers close the files (the originals were never closed).
with open('@naima_replies_all.json') as f:
    naima = f.read().splitlines()
with open('@naima_qts_all.json') as f:
    naima_qt = f.read().splitlines()
naima.extend(naima_qt)

# Remove duplicate tweets: the reply and QT scrapes can overlap, so keep
# only the first occurrence of each tweet ID. A set makes the membership
# test O(1) instead of the original list's O(n).
n2 = []
uniq_ids = set()
for line in naima:
    tweet = json.loads(line)
    if tweet['id'] not in uniq_ids:
        n2.append(tweet)
        uniq_ids.add(tweet['id'])
# Sort each tweet's links into Spotify track URLs and YouTube video IDs.
yt = []
spot = []
for tweet in n2:
    if tweet['links'] is None:
        continue
    for link in tweet['links']:
        url = link['url']
        if 'open.spotify.com/track/' in url:
            spot.append(url)
        elif 'youtube.com/watch' in url:
            # Guard the empty-match case: the original indexed [0]
            # unconditionally and crashed on malformed watch URLs.
            match = re.findall(yt_long_regex, url)
            if match:
                yt.append(match[0])
        elif 'youtu.be' in url:
            match = re.findall(yt_short_regex, url)
            if match:
                yt.append(match[0])
# --- Spotify processing ---
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret))
# The tracks endpoint accepts at most 50 IDs per request, so query in
# chunks of 50. Step slicing replaces the original's manual ceil-division
# binning (identical batches; no request at all when `spot` is empty).
spot_data = []
for start in range(0, len(spot), 50):
    batch = spot[start:start + 50]
    spot_data.extend(sp.tracks(batch)['tracks'])
# Convert each Spotify track record into [artist, title, url, 'spotify'].
# Tracks the API returned no data for are reported and skipped (the
# original appended empty placeholders and filtered them out afterward).
spotify_list = []
for n, track in enumerate(spot_data):
    if track is None:
        print('No data for Spotify track #', n)
        continue
    artist = prep_field(track['artists'][0]['name'])
    # Cut the title at " - " or at the first unexpected character
    # (remix/remaster suffixes etc.) before normalizing.
    title = prep_field(re.sub('( - |[^A-Za-z0-9\',& ]).+', '', track['name']))
    spotify_list.append([artist, title, spot[n], 'spotify'])
# --- YouTube processing ---
yt_list = []   # video titles
yt_ids = []    # video IDs (parallel to yt_list)
yt_users = []  # channel names (parallel to yt_list)
api = Api(api_key=yt_api_key)
for vid in yt:
    response = json.loads(api.get_video_by_id(video_id=vid).to_json())
    try:
        snippet = response['items'][0]['snippet']
    except IndexError:
        # No items returned (video deleted/private) -- skip it.
        print('skipped ID', vid)
        continue
    channel = snippet['channelTitle'].replace('VEVO', '').replace(' - Topic', '')
    # VEVO channels are CamelCased artist names; re-insert the spaces.
    if ' ' not in channel:
        channel = ' '.join(re.split(vevo_user_regex, channel))
    yt_list.append(snippet['title'])
    yt_users.append(channel)
    yt_ids.append(vid)
# Split each YouTube title into [artist, title, video_id].
yt_list2 = []
for n, raw_title in enumerate(yt_list):
    parts = raw_title.split(' - ')
    if len(parts) > 1:
        # VEVO-style "Artist - Title" metadata: first piece is the
        # artist, second (suffix-stripped) is the title.
        entry = [prep_field(parts[0]),
                 prep_field(re.sub(yt_extra_regex, '', parts[1]))]
    else:
        # No separator: fall back to the channel name as the artist and
        # treat the whole title as the song title.
        entry = [prep_field(re.sub(yt_extra_regex, '', yt_users[n])),
                 prep_field(parts[0])]
    entry.append(yt_ids[n])
    yt_list2.append(entry)
# --- Merge the YouTube and Spotify records ---
sp_df = pd.DataFrame(spotify_list)
yt_df = pd.DataFrame(yt_list2)
yt_df[3] = 'youtube'  # source column (Spotify rows already carry theirs)
all_df = pd.concat([sp_df, yt_df]).reset_index(drop=True)
all_df.columns = ['artist', 'title', 'id', 'type']
# "ARTIST_TITLE" matching key; drop the leading '_' left when the artist
# field is empty.
all_df['joined'] = (all_df.artist + '_' + all_df.title).apply(
    lambda key: key[1:] if key[0] == '_' else key)
uniq_df = all_df[['artist', 'title', 'joined']].drop_duplicates('joined')
uniq_list = uniq_df.values.tolist()
# Link-based counts per song and per artist, keyed for the text-mention
# pass below.
song_vc = all_df.joined.value_counts()
song_dict = dict(zip(song_vc.index, song_vc))
artist_vc = all_df.artist.value_counts()
artist_dict = dict(zip(artist_vc.index, artist_vc))
# --- Add plain-text mentions to the link-based counts ---
# A tweet whose normalized text contains both a known artist key and the
# matching title key counts as one extra mention for that song and artist.
tweet_text = [prep_field(html.unescape(t['renderedContent'])) for t in n2]
for text in tweet_text:
    for artist, title, joined in uniq_list:
        if artist in text and title in text:
            song_dict[joined] += 1
            artist_dict[artist] += 1
# Rank songs by total mention count, descending.
final_df = pd.DataFrame(song_dict.items(), columns=['song', 'ct'])
final_df = final_df.sort_values('ct', ascending=False).reset_index(drop=True)
# final_df[final_df.ct >= 12].to_csv('top_BMMC_tracks.csv',index=False)
# if you want to create a playlist or dataviz from this, you should remove duplicates first
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment