# rvanbruggen/1-playlist-importer-and-analyser.py

## 1-playlist-importer-and-analyser.py
import spotipy
from neo4j import GraphDatabase
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

# ------------------------------------ Configuration parameters ------------------------------------ #

# --- Spotify account and source playlist ---
# Spotify user ID.
user_id = "YOUR USER_ID"
# Spotify client ID.
client = "YOUR CLIENT"
# Spotify client secret.
secret = "YOUR SECRET"
# Original public playlist with the songs to be sorted.
# LONG variant:  playlist_uri = "spotify:playlist:1eCqsRrwBAFc2lf5ZLGa5m"
# SHORT variant (active):
playlist_uri = "spotify:playlist:1BTunw40NV9HgFpLXQ7hpm"

# --- Neo4j connection ---
# Connection URL of the Neo4j database.
neo4j_url = "neo4j://localhost:7687"
# Neo4j username; the database default is 'neo4j'.
neo4j_username = "neo4j"
# Neo4j password.
neo4j_password = "changeme"

# --- Spotify OAuth settings ---
# Scope required to manage playlists.
scope = 'playlist-modify-private'
# Callback URL; points at localhost for development.
redirect_uri = 'http://localhost:8888/callback'
# File in which Spotify caches the session variables.
cache_path = "spotify_cache.tmp"

# --- Behaviour switches ---
# Whether to create constraints.
create_constraints = True
# Whether to write the generated playlists back to Spotify.
write_to_spotify = False
# Whether to plot the k-means clusters used for the playlists.
plot_kmeans_clusters = False

# --- Playlist generation ---
# Playlists smaller than this are grouped as 'misc'.
min_playlist_size = 40
# Playlists of at least this size are chopped up into smaller ones.
playlist_split_limit = 160
# Description attached to the generated playlists.
playlist_desc = 'Generated using neo4j-playlist-builder.'
# Number of keywords used in dynamic playlist names.
playlist_keywords_count = 3
# Prefix put in front of the generated Spotify playlists.
playlist_prefix = '[NPB]'
# Generic keywords that are left out of playlist names.
filtered_keywords = '"pop", "mellow", "new", "rock", "folk"'

# Shared Spotify client, authenticated with the client-credentials flow.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=client,
        client_secret=secret,
    )
)

def load_graph_using_spotify_api():
    """Import the configured Spotify playlist into Neo4j and run the GDS analytics.

    Opens a session with the module-level Neo4j settings, rebuilds the graph
    (constraints, tracks, albums, artists, related artists, genres and their
    relationships), runs the similarity / PageRank / Louvain procedures, and
    prints "Done!" once every step has finished.

    The session is closed even when a step raises, so its connection is
    released and any error of the last query is surfaced instead of being
    dropped with the unconsumed result.
    """
    neo4j = create_neo4j_session(url=neo4j_url, user=neo4j_username, password=neo4j_password)
    try:
        _populate_graph(neo4j)
    finally:
        neo4j.close()
    # Printed only after the whole import has completed.
    print("Done!")


def _populate_graph(neo4j):
    """Run every import and analytics step of the playlist load on the session *neo4j*."""
    print("dropping and creating constraints...")
    recreate_contraints(neo4j)

    print("creating tracks...")
    tracks = get_tracks()
    tracks = get_track_audio_features(tracks)
    # SET t = track replaces the node's properties with the whole track map.
    neo4j.run("UNWIND $tracks as track CREATE (t:Track{id: track.id}) SET t = track",
              parameters={'tracks': list(tracks.values())})

    print("creating albums...")
    albums = get_album_info(tracks)
    # The id comes from the unwound map (album.id), mirroring the track query;
    # the node variable itself has no properties yet while it is being created.
    neo4j.run("UNWIND $albums as album CREATE (a:Album{id: album.id}) SET a = album",
              parameters={'albums': list(albums.values())})

    print("creating artists...")
    artists = get_artist_info(tracks)
    neo4j.run("UNWIND $artists as artist CREATE (a:Artist{id: artist.id}) SET a = artist",
              parameters={'artists': list(artists.values())})

    print("finding related artists..")
    related_artists = get_related_artists(artists)
    # MERGE (not CREATE): a related artist may already exist as a playlist artist.
    neo4j.run("""UNWIND $relatedartists as artist  MERGE (a:Artist {id: artist.id}) SET a = artist """,
              parameters={'relatedartists': list(related_artists.values())})
    # Link each related artist back to the artist it was looked up for.
    neo4j.run("""MATCH (a:Artist) WHERE EXISTS (a.original_artist) WITH a
                   MATCH (a2:Artist{id: a.original_artist})
                   MERGE (a)-[:SPOTIFY_RELATES_TO]->(a2)""")

    print("creating genres..")
    genres = get_genres(albums, artists)
    neo4j.run("UNWIND $genres as genre MERGE (g:Genre{name: genre})",
              parameters={'genres': list(genres)})

    print("Linking tracks to albums, genres, and artists...")
    neo4j.run("MATCH (t:Track), (a:Album{id: t.album}) CREATE (t)-[:TRACK_IN_ALBUM]->(a);")
    neo4j.run("MATCH (t:Track) UNWIND t.artists as artist MATCH (a:Artist{id: artist}) CREATE (t)-[:TRACK_HAS_ARTIST]->(a)")
    neo4j.run("MATCH (a:Artist) UNWIND a.genres as genre MATCH (g:Genre{name: genre}) CREATE (a)-[:ARTIST_HAS_GENRE]->(g)")
    # id(a1)<id(a2) keeps one direction per artist pair that shares a track.
    neo4j.run("MATCH (a1:Artist)<--(t:Track)-->(a2:Artist) WHERE id(a1)<id(a2) MERGE (a1)-[:WORKED_WITH {track:t.uri}]->(a2)")
    neo4j.run("MATCH (ar:Artist)<--(t:Track)-->(al:Album) MERGE (al)-[:ALBUM_HAS_ARTIST]->(ar)")

    # Genre-overlap similarity between artists, written back as
    # GDS_ARTIST_SIMILAR_OVERLAP relationships carrying a 'score' property.
    print("Calculate artist similarity using GDS..")
    neo4j.run("""
        MATCH (item:`Artist`)-[:`ARTIST_HAS_GENRE`]->(category:`Genre`)
        WITH {item:id(item), categories: collect(distinct id(category))} as userData
        WITH collect(userData) as dataset
        CALL gds.alpha.similarity.overlap.write({
            data: dataset,
            weightproperty: null,
            nodeProjection: '*',
            writeProperty: 'score',
            writeRelationshipType: 'GDS_ARTIST_SIMILAR_OVERLAP',
            similarityCutoff: 0.05,
            degreeCutoff: 0 })
        YIELD nodes, similarityPairs, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100
        RETURN nodes, similarityPairs, writeRelationshipType, writeProperty, min, max, mean, p95""")

    # PageRank over Spotify's related-artist links (directed).
    print("Calculate artist pagerank-spotify using GDS..")
    neo4j.run("""
        CALL gds.pageRank.write({
            nodeProjection: 'Artist',
            relationshipProjection: {
                relType: {
                type: 'SPOTIFY_RELATES_TO',
                orientation: 'NATURAL',
                properties: {}
                }
            },
            relationshipWeightProperty: null,
            dampingFactor: 0.85,
            maxIterations: 20,
            writeProperty: 'pagerank-spotify'})
        """)

    # PageRank over shared-track collaborations (undirected).
    print("Calculate artist pagerank-workedwith using GDS..")
    neo4j.run("""
        CALL gds.pageRank.write({
            nodeProjection: 'Artist',
            relationshipProjection: {
                relType: {
                type: 'WORKED_WITH',
                orientation: 'UNDIRECTED',
                properties: {}
                }
            },
            relationshipWeightProperty: null,
            dampingFactor: 0.85,
            maxIterations: 20,
            writeProperty: 'pagerank-workedwith'
            })
        """)

    # PageRank over the genre-overlap similarity relationships written above.
    print("Calculate artist pagerank-similarity using GDS..")
    neo4j.run("""
        CALL gds.pageRank.write({
        nodeProjection: 'Artist',
        relationshipProjection: {
            relType: {
            type: 'GDS_ARTIST_SIMILAR_OVERLAP',
            orientation: 'UNDIRECTED',
            properties: {}
            }
        },
        relationshipWeightProperty: null,
        dampingFactor: 0.85,
        maxIterations: 20,
        writeProperty: 'pagerank-similarity'
            })
        """)

    # Louvain community detection over the related-artist links.
    # NOTE(review): 'valence' is used as seed/node property on Artist nodes,
    # but nothing in this function sets it on artists - confirm it exists there.
    print("Calculate artist Louvain community using GDS..")
    neo4j.run("""
        CALL gds.louvain.write({
        nodeProjection: 'Artist',
        relationshipProjection: {
            relType: {
            type: 'SPOTIFY_RELATES_TO',
            orientation: 'UNDIRECTED',
            properties: {}
            }
        },
        relationshipWeightProperty: null,
        includeIntermediateCommunities: false,
        seedProperty: 'valence',
        nodeProperties: [
            'valence'
            ],
        writeProperty: 'louvain-community'})
        """)


def recreate_contraints(neo4j):
    """Reset the schema and contents of the database behind the session *neo4j*.

    Drops every constraint reported by ``db.constraints``, recreates the
    uniqueness constraints for Genre, Album, Artist and Track, and finally
    deletes all nodes together with their relationships.
    """
    # Each listed constraint is dropped by replaying its description.
    for existing in neo4j.run("CALL db.constraints"):
        neo4j.run("DROP " + existing['description'])

    uniqueness_constraints = (
        "CREATE CONSTRAINT ON (g:Genre) ASSERT g.name IS UNIQUE",
        "CREATE CONSTRAINT ON (a:Album) ASSERT a.id IS UNIQUE",
        "CREATE CONSTRAINT ON (a:Artist) ASSERT a.id IS UNIQUE",
        "CREATE CONSTRAINT ON (t:Track) ASSERT t.id IS UNIQUE",
    )
    for statement in uniqueness_constraints:
        neo4j.run(statement)

    # Clear whatever data is left from a previous import.
    neo4j.run("MATCH (n) DETACH DELETE n;")

def get_tracks():
    """Fetch every track of the configured playlist, flattened for storage.

    Reads the playlist named by the module-level ``playlist_uri`` through the
    module-level ``spotify`` client and follows the paging links until the
    last page has been processed.

    Returns:
        dict mapping track id to the track dict, in which ``artists`` is a
        list of artist ids, ``album`` is the album id, and every remaining
        dict-valued field is set to None.
    """
    results = spotify.playlist(playlist_uri)['tracks']
    items = {}
    # Process the current page first and only then decide whether to go on,
    # so the final page (which has no 'next' link) is included as well.
    while True:
        for entry in results["items"]:
            track = entry['track']
            # Skip entries without a track object or without an id; they
            # cannot be keyed (presumably removed or local items - TODO confirm).
            if not track or not track['id']:
                continue
            track['artists'] = [artist if isinstance(artist, str) else artist['id']
                                for artist in track['artists']]
            album = track['album']
            track['album'] = album if isinstance(album, str) else album['id']
            # Blank out the remaining nested objects so only flat values are left.
            # Assigning to existing keys does not resize the dict, so this is
            # safe while iterating it.
            for field, value in track.items():
                if isinstance(value, dict):
                    track[field] = None
            items[track['id']] = track
        if not results['next']:
            break
        results = spotify.next(results)
    return items


def get_track_audio_features(tracks, page_size=100):
    """Merge Spotify audio features into the given track dicts, in place.

    Args:
        tracks: dict mapping track id to a track dict; it is updated and
            also returned.
        page_size: number of track ids sent per ``audio_features`` request.

    Returns:
        The same ``tracks`` dict, where every track for which features were
        returned has each feature copied onto it (the 'type' entry is left out).
    """
    track_ids = list(tracks.keys())
    total_pages = int(len(track_ids) / page_size) + 1
    for page in range(total_pages):
        start = page * page_size
        batch = track_ids[start:start + page_size]
        # An exact multiple of page_size leaves a trailing empty page.
        if not batch:
            break
        for features in spotify.audio_features(tracks=batch):
            # The response holds None where no features are available.
            if features is None:
                continue
            owner_id = features['id']
            for name, value in features.items():
                if name == 'type':
                    continue
                tracks[owner_id][name] = value
    return tracks


def get_album_info(tracks, page_size=20):
    """Fetch the albums referenced by *tracks* and flatten them for storage.

    Args:
        tracks: dict mapping track id to a track dict whose ``album`` entry
            is an album id.
        page_size: number of album ids sent per ``albums`` request.

    Returns:
        dict mapping album id to the album dict, in which ``artists`` is a
        list of artist ids, ``images`` is a single image URL (None when the
        album has no images), ``external_ids`` / ``external_urls`` are None,
        ``tracks`` is the track count and ``copyrights`` the number of
        copyright entries.
    """
    album_ids = list({track['album'] for track in tracks.values()})

    all_albums = {}
    # Stepping by page_size never yields an empty page, so no request is sent
    # without ids (zero albums, or a count that is a multiple of page_size).
    for start in range(0, len(album_ids), page_size):
        results = spotify.albums(album_ids[start:start + page_size])

        for album in results['albums']:
            # Presumably the API answers None for ids it cannot resolve - skip those.
            if album is None:
                continue
            album['artists'] = [artist['id'] for artist in album['artists']]
            # Prefer the second image; fall back to the first when it is the only one.
            images = album['images']
            album['images'] = images[min(1, len(images) - 1)]['url'] if images else None
            album['external_ids'] = None
            album['external_urls'] = None
            # 'tracks' is a paging object; len() of that dict would count its
            # keys, so the number of tracks is taken from its 'total' field.
            album_tracks = album['tracks']
            if isinstance(album_tracks, dict) and 'total' in album_tracks:
                album['tracks'] = album_tracks['total']
            else:
                album['tracks'] = len(album_tracks)
            album['copyrights'] = len(album['copyrights'])
            all_albums[album['id']] = album
    return all_albums


def get_artist_info(items, page_size=50):
    """Fetch the artists referenced by the given tracks and flatten them.

    Args:
        items: dict mapping track id to a track dict whose ``artists`` entry
            is a list of artist ids.
        page_size: number of artist ids sent per ``artists`` request.

    Returns:
        dict mapping artist id to the artist dict, in which ``images`` is a
        single image URL (left untouched when the artist has no images),
        ``followers`` is the follower total and ``external_urls`` is None.
    """
    unique_ids = set()
    for track in items.values():
        unique_ids.update(track['artists'])
    artist_ids = list(unique_ids)

    all_artists = {}
    # Stepping by page_size never yields an empty page, so no request is sent
    # without ids (zero artists, or a count that is a multiple of page_size).
    for start in range(0, len(artist_ids), page_size):
        results = spotify.artists(artist_ids[start:start + page_size])
        for artist in results['artists']:
            # Presumably the API answers None for ids it cannot resolve - skip those.
            if artist is None:
                continue
            images = artist['images']
            if images:
                # Prefer the second image; fall back to the first when it is the only one.
                artist['images'] = images[min(1, len(images) - 1)]['url']
            artist['followers'] = artist['followers']['total']
            artist['external_urls'] = None
            all_artists[artist['id']] = artist
    return all_artists

def get_related_artists(items, page_size=50):
    """Fetch the related artists of every artist in *items*, flattened.

    Args:
        items: iterable of artist ids (a dict keyed by artist id works, as
            only its keys are read).
        page_size: kept for backward compatibility; related artists are
            requested one artist at a time, so it has no effect.

    Returns:
        dict mapping related-artist id to the artist dict, extended with
        ``original_artist`` (the id it was looked up for) and with ``images``
        reduced to one URL, ``followers`` to the total and ``external_urls``
        set to None.
    """
    new_artists = {}
    for artist_id in list(items):
        related_artists = spotify.artist_related_artists(artist_id)
        for related_artist in related_artists['artists']:
            related_artist['original_artist'] = artist_id
            images = related_artist["images"]
            if images:
                # Prefer the second image; fall back to the first when it is the only one.
                related_artist['images'] = images[min(1, len(images) - 1)]['url']
            related_artist['followers'] = related_artist['followers']['total']
            related_artist['external_urls'] = None
            # NOTE(review): keyed by the related artist's id, so an artist
            # related to several inputs keeps only the last original_artist.
            new_artists[related_artist['id']] = related_artist
    return new_artists

def get_genres(albums, artists):
    """Return the set of all genre names found on the given albums and artists.

    Both arguments map an id to a dict that carries a ``genres`` iterable.
    """
    genres = set()
    for collection in (albums, artists):
        for key in collection:
            genres.update(collection[key]['genres'])
    return genres

def create_neo4j_session(url, user, password):
    """Create a Neo4j driver for *url* with user/password auth and return a session from it."""
    credentials = (user, password)
    return GraphDatabase.driver(url, auth=credentials).session()

# Script entry point: run the import only when the file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    load_graph_using_spotify_api()