Last active
October 16, 2020 07:36
-
-
Save rvanbruggen/e78ef1002f2123822c1787c1e697acce to your computer and use it in GitHub Desktop.
Spotify Playlist Joy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spotipy | |
from neo4j import GraphDatabase | |
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth | |
# ------------------------------------ Configuration parameters ------------------------------------ #
# NOTE(review): only client, secret, playlist_uri and the neo4j_* settings are referenced by the
# loader code in the portion of the file reviewed; the remaining settings are presumably consumed
# by a companion playlist-building script -- confirm before removing any of them.
user_id = "YOUR USER_ID"  # Spotify user ID.
client = "YOUR CLIENT"  # Spotify client ID.
secret = "YOUR SECRET"  # Spotify client secret.
# Alternative source playlist (LONG original public playlist with songs to be sorted):
# playlist_uri = "spotify:playlist:1eCqsRrwBAFc2lf5ZLGa5m"
playlist_uri = "spotify:playlist:1BTunw40NV9HgFpLXQ7hpm"  # SHORT original public playlist with songs to be sorted.
neo4j_url = "neo4j://localhost:7687"  # Connection URL of the Neo4j database.
neo4j_username = "neo4j"  # Neo4j username; 'neo4j' is the usual default account.
neo4j_password = "changeme"  # Neo4j password.
scope = 'playlist-modify-private'  # Spotify scope required to manage playlists.
redirect_uri = 'http://localhost:8888/callback'  # Spotify callback URL. Set to localhost for development.
cache_path = "spotify_cache.tmp"  # Where Spotify caches the session variables.
create_constraints = True  # Whether to create constraints.
write_to_spotify = False  # Whether to write the generated playlists back to Spotify.
plot_kmeans_clusters = False  # Whether to plot the k-means clusters used for playlists.
min_playlist_size = 40  # Cut-off below which playlists are grouped as 'misc'.
playlist_split_limit = 160  # Minimum size at which a playlist is chopped up into smaller ones.
playlist_desc = 'Generated using neo4j-playlist-builder.'  # Description of the generated playlists.
playlist_keywords_count = 3  # Number of keywords to use in dynamic playlist names.
playlist_prefix = '[NPB]'  # Prefix to put in front of your Spotify playlists.
filtered_keywords = '"pop", "mellow", "new", "rock", "folk"'  # Generic keywords to keep out of playlist names.
# Module-level Spotify client shared by the get_* helpers below; it authenticates with the
# client-credentials flow using the client ID and secret configured above.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=client,
        client_secret=secret,
    ),
)
def load_graph_using_spotify_api():
    """Rebuild the Neo4j music graph from the configured Spotify playlist.

    Steps, in order:
      1. Drop/recreate constraints and wipe the database (``recreate_contraints``).
      2. Fetch tracks, audio features, albums, artists and related artists from
         Spotify and write them as Track / Album / Artist / Genre nodes.
      3. Link the nodes (TRACK_IN_ALBUM, TRACK_HAS_ARTIST, ARTIST_HAS_GENRE,
         WORKED_WITH, ALBUM_HAS_ARTIST, SPOTIFY_RELATES_TO).
      4. Run GDS procedures (overlap similarity, three PageRank variants, Louvain)
         that write their results back onto the graph.

    The Neo4j session is closed when the function finishes, whether it succeeds
    or raises.
    """
    neo4j = create_neo4j_session(url=neo4j_url, user=neo4j_username, password=neo4j_password)
    try:
        print("dropping and creating constraints...")
        recreate_contraints(neo4j)

        print("creating tracks...")
        tracks = get_tracks()
        tracks = get_track_audio_features(tracks)
        neo4j.run("UNWIND $tracks as track CREATE (t:Track{id: track.id}) SET t = track",
                  parameters={'tracks': list(tracks.values())})

        print("creating albums...")
        albums = get_album_info(tracks)
        # The id must come from the unwound map (`album`), not from the node
        # variable that is only being introduced by this CREATE.
        neo4j.run("UNWIND $albums as album CREATE (a:Album{id: album.id}) SET a = album",
                  parameters={'albums': list(albums.values())})

        print("creating artists...")
        artists = get_artist_info(tracks)
        # Same correction as for albums: read the id from `artist`.
        neo4j.run("UNWIND $artists as artist CREATE (a:Artist{id: artist.id}) SET a = artist",
                  parameters={'artists': list(artists.values())})

        print("finding related artists..")
        related_artists = get_related_artists(artists)
        # MERGE rather than CREATE: a related artist may already exist as a playlist artist.
        neo4j.run("""UNWIND $relatedartists as artist MERGE (a:Artist {id: artist.id}) SET a = artist """,
                  parameters={'relatedartists': list(related_artists.values())})
        neo4j.run("""MATCH (a:Artist) WHERE EXISTS (a.original_artist) WITH a
            MATCH (a2:Artist{id: a.original_artist})
            MERGE (a)-[:SPOTIFY_RELATES_TO]->(a2)""")

        print("creating genres..")
        genres = get_genres(albums, artists)
        neo4j.run("UNWIND $genres as genre MERGE (g:Genre{name: genre})",
                  parameters={'genres': list(genres)})

        print("Linking tracks to albums, genres, and artists...")
        neo4j.run("MATCH (t:Track), (a:Album{id: t.album}) CREATE (t)-[:TRACK_IN_ALBUM]->(a);")
        neo4j.run("MATCH (t:Track) UNWIND t.artists as artist MATCH (a:Artist{id: artist}) CREATE (t)-[:TRACK_HAS_ARTIST]->(a)")
        neo4j.run("MATCH (a:Artist) UNWIND a.genres as genre MATCH (g:Genre{name: genre}) CREATE (a)-[:ARTIST_HAS_GENRE]->(g)")
        # id(a1) < id(a2) keeps one relationship per artist pair and track instead of two.
        neo4j.run("MATCH (a1:Artist)<--(t:Track)-->(a2:Artist) WHERE id(a1)<id(a2) MERGE (a1)-[:WORKED_WITH {track:t.uri}]->(a2)")
        neo4j.run("MATCH (ar:Artist)<--(t:Track)-->(al:Album) MERGE (al)-[:ALBUM_HAS_ARTIST]->(ar)")

        print("Calculate artist similarity using GDS..")
        neo4j.run("""
            MATCH (item:`Artist`)-[:`ARTIST_HAS_GENRE`]->(category:`Genre`)
            WITH {item:id(item), categories: collect(distinct id(category))} as userData
            WITH collect(userData) as dataset
            CALL gds.alpha.similarity.overlap.write({
                data: dataset,
                weightproperty: null,
                nodeProjection: '*',
                writeProperty: 'score',
                writeRelationshipType: 'GDS_ARTIST_SIMILAR_OVERLAP',
                similarityCutoff: 0.05,
                degreeCutoff: 0 })
            YIELD nodes, similarityPairs, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
            RETURN nodes, similarityPairs, writeRelationshipType, writeProperty, min, max, mean, p95""")

        print("Calculate artist pagerank-spotify using GDS..")
        neo4j.run("""
            CALL gds.pageRank.write({
                nodeProjection: 'Artist',
                relationshipProjection: {
                    relType: {
                        type: 'SPOTIFY_RELATES_TO',
                        orientation: 'NATURAL',
                        properties: {}
                    }
                },
                relationshipWeightProperty: null,
                dampingFactor: 0.85,
                maxIterations: 20,
                writeProperty: 'pagerank-spotify'})
            """)

        print("Calculate artist pagerank-workedwith using GDS..")
        neo4j.run("""
            CALL gds.pageRank.write({
                nodeProjection: 'Artist',
                relationshipProjection: {
                    relType: {
                        type: 'WORKED_WITH',
                        orientation: 'UNDIRECTED',
                        properties: {}
                    }
                },
                relationshipWeightProperty: null,
                dampingFactor: 0.85,
                maxIterations: 20,
                writeProperty: 'pagerank-workedwith'
            })
            """)

        print("Calculate artist pagerank-similarity using GDS..")
        neo4j.run("""
            CALL gds.pageRank.write({
                nodeProjection: 'Artist',
                relationshipProjection: {
                    relType: {
                        type: 'GDS_ARTIST_SIMILAR_OVERLAP',
                        orientation: 'UNDIRECTED',
                        properties: {}
                    }
                },
                relationshipWeightProperty: null,
                dampingFactor: 0.85,
                maxIterations: 20,
                writeProperty: 'pagerank-similarity'
            })
            """)

        print("Calculate artist Louvain community using GDS..")
        # NOTE(review): 'valence' is used as seed/node property on Artist nodes here;
        # nothing in this loader visibly sets it on artists -- confirm this is intended.
        neo4j.run("""
            CALL gds.louvain.write({
                nodeProjection: 'Artist',
                relationshipProjection: {
                    relType: {
                        type: 'SPOTIFY_RELATES_TO',
                        orientation: 'UNDIRECTED',
                        properties: {}
                    }
                },
                relationshipWeightProperty: null,
                includeIntermediateCommunities: false,
                seedProperty: 'valence',
                nodeProperties: [
                    'valence'
                ],
                writeProperty: 'louvain-community'})
            """)

        print("Done!")
    finally:
        # Release the session even when a Spotify or Neo4j call fails midway.
        neo4j.close()
def recreate_contraints(neo4j):
    """Reset the database: drop all constraints, delete all data, recreate constraints.

    Args:
        neo4j: An open session-like object exposing ``run(query)``; the result of
            ``CALL db.constraints`` must be iterable and yield records with a
            ``'description'`` entry.

    The data is wiped *before* the uniqueness constraints are recreated, so
    leftover duplicate nodes from an earlier run cannot make constraint
    creation fail.
    """
    # Collect the descriptions first so the listing result is fully read
    # before further statements are issued on the same session.
    descriptions = [record['description'] for record in neo4j.run("CALL db.constraints")]
    for description in descriptions:
        neo4j.run("DROP " + description)

    # Clear every node and relationship.
    neo4j.run("MATCH (n) DETACH DELETE n;")

    # Uniqueness constraints for the node keys the loader relies on.
    neo4j.run("CREATE CONSTRAINT ON (g:Genre) ASSERT g.name IS UNIQUE")
    neo4j.run("CREATE CONSTRAINT ON (a:Album) ASSERT a.id IS UNIQUE")
    neo4j.run("CREATE CONSTRAINT ON (a:Artist) ASSERT a.id IS UNIQUE")
    neo4j.run("CREATE CONSTRAINT ON (t:Track) ASSERT t.id IS UNIQUE")
def get_tracks():
    """Fetch every track of the configured playlist, keyed by Spotify track id.

    Each stored track dict is flattened so it can be written as node properties:
    ``artists`` becomes a list of artist ids, ``album`` becomes the album id, and
    any remaining dict-valued field is set to ``None``.

    Playlist entries without a track object or without a track id are skipped.

    Returns:
        dict: ``{track_id: track_dict}`` covering all pages of the playlist.
    """
    page = spotify.playlist(playlist_uri)['tracks']
    items = {}
    # Walk every page, including the last one. The previous loop condition
    # stopped before processing the final page of a multi-page playlist,
    # because that page has a 'previous' link but no 'next' link.
    while page:
        for entry in page['items']:
            track = entry.get('track') if entry else None
            if not track or not track.get('id'):
                continue
            track['artists'] = [artist if isinstance(artist, str) else artist['id']
                                for artist in track['artists']]
            album = track['album']
            track['album'] = album if isinstance(album, str) else album['id']
            # Null out any remaining nested objects after the flattening above.
            nested_fields = [field for field, value in track.items() if isinstance(value, dict)]
            for field in nested_fields:
                track[field] = None
            items[track['id']] = track
        page = spotify.next(page) if page['next'] else None
    return items
def get_track_audio_features(tracks, page_size=100):
    """Merge Spotify audio features into the given track dicts, in place.

    Args:
        tracks: Mapping of track id to track dict (as built by ``get_tracks``).
        page_size: Number of track ids sent per audio-features request.

    Returns:
        The same ``tracks`` mapping. Every feature except ``'type'`` is copied
        onto the matching track dict; ``None`` entries in a response are skipped.
    """
    # Build the id list once instead of on every page, and step through it
    # with an integer range rather than float page arithmetic.
    track_ids = list(tracks)
    for start in range(0, len(track_ids), page_size):
        batch = track_ids[start:start + page_size]
        for track_features in spotify.audio_features(tracks=batch):
            if track_features is None:
                continue
            target = tracks[track_features['id']]
            for feature, value in track_features.items():
                # 'type' is skipped so it is not copied onto the track dict.
                if feature != 'type':
                    target[feature] = value
    return tracks
def get_album_info(tracks, page_size=20):
    """Fetch album details for every album referenced by the given tracks.

    Args:
        tracks: Mapping of track id to track dict whose ``'album'`` entry is an
            album id.
        page_size: Number of album ids sent per request.

    Returns:
        dict: ``{album_id: album_dict}``. Each album dict is flattened in place:
        ``artists`` becomes a list of artist ids, ``images`` becomes a single
        image URL when images exist, ``external_ids`` / ``external_urls`` are set
        to ``None``, ``tracks`` becomes the album's track count and
        ``copyrights`` the number of copyright entries.
    """
    # De-duplicate once and drop missing ids; an empty id list is never sent.
    album_ids = list({track['album'] for track in tracks.values() if track['album']})
    all_albums = {}
    for start in range(0, len(album_ids), page_size):
        batch = album_ids[start:start + page_size]
        results = spotify.albums(batch)
        for album in results['albums']:
            if album is None:
                # Skip null entries in the response.
                continue
            album['artists'] = [artist['id'] for artist in album['artists']]
            images = album['images']
            if images:
                # Prefer the second image; fall back to the first if there is only one.
                album['images'] = images[min(1, len(images) - 1)]['url']
            album['external_ids'] = None
            album['external_urls'] = None
            # 'tracks' is expected to be a paging object, so len() of it would
            # count its keys; use its reported total instead.
            album_tracks = album['tracks']
            album['tracks'] = album_tracks['total'] if isinstance(album_tracks, dict) else len(album_tracks)
            album['copyrights'] = len(album['copyrights'])
            all_albums[album['id']] = album
    return all_albums
def get_artist_info(items, page_size=50):
    """Fetch artist details for every artist referenced by the given tracks.

    Args:
        items: Mapping of track id to track dict whose ``'artists'`` entry is a
            list of artist ids.
        page_size: Number of artist ids sent per request.

    Returns:
        dict: ``{artist_id: artist_dict}``. Each artist dict is flattened in
        place: ``images`` becomes a single image URL when images exist,
        ``followers`` becomes the follower total and ``external_urls`` is set to
        ``None``.
    """
    # Collect the distinct, non-empty artist ids across all tracks.
    artist_ids = list({artist_id
                       for track in items.values()
                       for artist_id in track['artists']
                       if artist_id})
    all_artists = {}
    # Stepping over the id list never yields an empty page, so no request is
    # made with an empty id list (previously possible when the count was 0 or
    # an exact multiple of page_size).
    for start in range(0, len(artist_ids), page_size):
        batch = artist_ids[start:start + page_size]
        results = spotify.artists(batch)
        for artist in results['artists']:
            if artist is None:
                continue
            images = artist["images"]
            if images:
                # Prefer the second image; fall back to the first if there is only one.
                artist['images'] = images[min(1, len(images) - 1)]['url']
            artist['followers'] = artist['followers']['total']
            artist['external_urls'] = None
            all_artists[artist['id']] = artist
    return all_artists
def get_related_artists(items, page_size=50):
    """Fetch the related artists of every artist id in ``items``.

    Args:
        items: Iterable of artist ids (e.g. the dict returned by
            ``get_artist_info``, which iterates over its keys).
        page_size: Unused; kept so existing callers passing it keep working.
            One request is made per artist regardless of this value.

    Returns:
        dict: ``{related_artist_id: artist_dict}``. Each dict gains an
        ``original_artist`` entry holding the id it was found through and is
        flattened like in ``get_artist_info``. When a related artist is returned
        for several source artists, the entry from the last one processed wins.
    """
    new_artists = {}
    for source_id in list(items):
        related_artists = spotify.artist_related_artists(source_id)
        for related_artist in related_artists['artists']:
            related_artist['original_artist'] = source_id
            images = related_artist["images"]
            if images:
                # Prefer the second image; fall back to the first if there is only one.
                related_artist['images'] = images[min(1, len(images) - 1)]['url']
            related_artist['followers'] = related_artist['followers']['total']
            related_artist['external_urls'] = None
            new_artists[related_artist['id']] = related_artist
    return new_artists
def get_genres(albums, artists):
    """Return the set of all genre names found on the given albums and artists.

    Both arguments are mappings whose values carry a ``'genres'`` iterable.
    """
    collected = set()
    for catalog in (albums, artists):
        for entry in catalog.values():
            collected.update(entry['genres'])
    return collected
def create_neo4j_session(url, user, password):
    """Create a Neo4j driver for ``url`` and return a new session from it.

    The driver authenticates with the ``(user, password)`` pair. Only the
    session is returned; the driver object itself is not exposed to the caller.
    """
    credentials = (user, password)
    return GraphDatabase.driver(url, auth=credentials).session()
# Script entry point: run the full load only when executed directly, not on import.
if __name__ == "__main__":
    load_graph_using_spotify_api()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment