Skip to content

Instantly share code, notes, and snippets.

@dfreelon
Created July 1, 2023 14:02
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
Save dfreelon/a70c85796dd6f450d03f4d06ceb55739 to your computer and use it in GitHub Desktop.
The code I used to create the #BlackMusicMonthChallenge top tracks playlists for Spotify and YouTube. The code requires valid Spotify and YouTube API credentials.
# -*- coding: utf-8 -*-
# run the following two snscrape commands from a CLI first -- you'll need the dev version from https://github.com/JustAnotherArchivist/snscrape
# future replicators: I recommend using Twitter's "since" and "until" date operators. I used snscrape's built-in "since" operator because I ran it on June 30, so I didn't need "until"
#for replies: snscrape --since 2023-05-31 --jsonl twitter-search @naima >@naima_replies_all.json
#for QTs: snscrape --since 2023-05-31 --jsonl twitter-search quoted_user_id:78417631 >@naima_qts_all.json
import html
import json
import pandas as pd
from pyyoutube import Api
import re
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# --- API credentials (fill these in before running) ---
spotify_client_id = "" #you'll need proper Spotify and YT creds, obv
spotify_client_secret = ""
yt_api_key = ''

# --- URL/title regexes ---
# Raw strings: the originals used '\?' and '\"' inside plain strings, which
# are invalid escape sequences (SyntaxWarning on Python 3.12+). Values are
# unchanged except 'youtu.be' now escapes the dot so '.' doesn't match any
# character.
# capture the video ID from a long-form youtube.com/watch?v=... URL
yt_long_regex = r'(?:v=)(.+?)(?:&|$|\?)'
# capture the video ID from a short youtu.be/... URL
yt_short_regex = r'(?:youtu\.be/)(.+?)(?:&|$|/|\?)'
# match everything from the first "unexpected" character onward (used to
# strip remix/remaster/etc. suffixes from titles)
yt_extra_regex = r'[^A-Za-z0-9-\':\",.& ].+'
# zero-width split point between a lowercase and an uppercase letter,
# used to re-space CamelCased VEVO channel names
vevo_user_regex = r'(?<=[a-z])(?:)(?=[A-Z])'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 50)
def prep_field(field):
trimmed = re.sub('[^A-Z0-9_ ]','',field.upper())
trimmed = trimmed.strip().replace(' ','_')
return re.sub('THE_|AND_','',trimmed)
# Load the scraped replies and quote-tweets (one JSON object per line).
# Context managers close the files (the originals were never closed).
with open('@naima_replies_all.json') as f:
    naima = f.read().splitlines()
with open('@naima_qts_all.json') as f:
    naima_qt = f.read().splitlines()
naima.extend(naima_qt)

# Remove duplicate tweets: the reply and QT scrapes can overlap, so keep
# only the first occurrence of each tweet ID. A set makes the membership
# test O(1) instead of the original list's O(n).
n2 = []
uniq_ids = set()
for line in naima:
    tweet = json.loads(line)
    if tweet['id'] not in uniq_ids:
        n2.append(tweet)
        uniq_ids.add(tweet['id'])
# Sort each tweet's links into Spotify track URLs and YouTube video IDs.
yt = []
spot = []
for tweet in n2:
    if tweet['links'] is None:
        continue
    for link in tweet['links']:
        url = link['url']
        if 'open.spotify.com/track/' in url:
            spot.append(url)
        elif 'youtube.com/watch' in url:
            # Guard the empty-match case: the original indexed [0]
            # unconditionally and crashed on malformed watch URLs.
            match = re.findall(yt_long_regex, url)
            if match:
                yt.append(match[0])
        elif 'youtu.be' in url:
            match = re.findall(yt_short_regex, url)
            if match:
                yt.append(match[0])
# --- Spotify processing ---
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(
    client_id=spotify_client_id,
    client_secret=spotify_client_secret))
# The tracks endpoint accepts at most 50 IDs per request, so query in
# chunks of 50. Step slicing replaces the original's manual ceil-division
# binning (identical batches; no request at all when `spot` is empty).
spot_data = []
for start in range(0, len(spot), 50):
    batch = spot[start:start + 50]
    spot_data.extend(sp.tracks(batch)['tracks'])
# Convert each Spotify track record into [artist, title, url, 'spotify'].
# Tracks the API returned no data for are reported and skipped (the
# original appended empty placeholders and filtered them out afterward).
spotify_list = []
for n, track in enumerate(spot_data):
    if track is None:
        print('No data for Spotify track #', n)
        continue
    artist = prep_field(track['artists'][0]['name'])
    # Cut the title at " - " or at the first unexpected character
    # (remix/remaster suffixes etc.) before normalizing.
    title = prep_field(re.sub('( - |[^A-Za-z0-9\',& ]).+', '', track['name']))
    spotify_list.append([artist, title, spot[n], 'spotify'])
# --- YouTube processing ---
yt_list = []   # video titles
yt_ids = []    # video IDs (parallel to yt_list)
yt_users = []  # channel names (parallel to yt_list)
api = Api(api_key=yt_api_key)
for vid in yt:
    response = json.loads(api.get_video_by_id(video_id=vid).to_json())
    try:
        snippet = response['items'][0]['snippet']
    except IndexError:
        # No items returned (video deleted/private) -- skip it.
        print('skipped ID', vid)
        continue
    channel = snippet['channelTitle'].replace('VEVO', '').replace(' - Topic', '')
    # VEVO channels are CamelCased artist names; re-insert the spaces.
    if ' ' not in channel:
        channel = ' '.join(re.split(vevo_user_regex, channel))
    yt_list.append(snippet['title'])
    yt_users.append(channel)
    yt_ids.append(vid)
# Split each YouTube title into [artist, title, video_id].
yt_list2 = []
for n, raw_title in enumerate(yt_list):
    parts = raw_title.split(' - ')
    if len(parts) > 1:
        # VEVO-style "Artist - Title" metadata: first piece is the
        # artist, second (suffix-stripped) is the title.
        entry = [prep_field(parts[0]),
                 prep_field(re.sub(yt_extra_regex, '', parts[1]))]
    else:
        # No separator: fall back to the channel name as the artist and
        # treat the whole title as the song title.
        entry = [prep_field(re.sub(yt_extra_regex, '', yt_users[n])),
                 prep_field(parts[0])]
    entry.append(yt_ids[n])
    yt_list2.append(entry)
# --- Merge the YouTube and Spotify records ---
sp_df = pd.DataFrame(spotify_list)
yt_df = pd.DataFrame(yt_list2)
yt_df[3] = 'youtube'  # source column (Spotify rows already carry theirs)
all_df = pd.concat([sp_df, yt_df]).reset_index(drop=True)
all_df.columns = ['artist', 'title', 'id', 'type']
# "ARTIST_TITLE" matching key; drop the leading '_' left when the artist
# field is empty.
all_df['joined'] = (all_df.artist + '_' + all_df.title).apply(
    lambda key: key[1:] if key[0] == '_' else key)
uniq_df = all_df[['artist', 'title', 'joined']].drop_duplicates('joined')
uniq_list = uniq_df.values.tolist()
# Link-based counts per song and per artist, keyed for the text-mention
# pass below.
song_vc = all_df.joined.value_counts()
song_dict = dict(zip(song_vc.index, song_vc))
artist_vc = all_df.artist.value_counts()
artist_dict = dict(zip(artist_vc.index, artist_vc))
# --- Add plain-text mentions to the link-based counts ---
# A tweet whose normalized text contains both a known artist key and the
# matching title key counts as one extra mention for that song and artist.
tweet_text = [prep_field(html.unescape(t['renderedContent'])) for t in n2]
for text in tweet_text:
    for artist, title, joined in uniq_list:
        if artist in text and title in text:
            song_dict[joined] += 1
            artist_dict[artist] += 1
# Rank songs by total mention count, descending.
final_df = pd.DataFrame(song_dict.items(), columns=['song', 'ct'])
final_df = final_df.sort_values('ct', ascending=False).reset_index(drop=True)
# final_df[final_df.ct >= 12].to_csv('top_BMMC_tracks.csv',index=False)
# if you want to create a playlist or dataviz from this, you should remove duplicates first
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment