lyric similarity analysis (n-gram tf-idf + agglomerative clustering)
from __future__ import print_function
from functools import reduce  # builtin on Python 2; must be imported on Python 3
import json
import os
import re
import sys

import numpy as np
from nltk.stem.snowball import SnowballStemmer
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import squareform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# to_tree() walks the hierarchy recursively; deep trees can blow the default limit
sys.setrecursionlimit(1500)
# snowball stemmer for reducing words to stems; stopword removal is handled
# by TfidfVectorizer's built-in 'english' list below
stemmer = SnowballStemmer('english')
link_method = "complete"
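# 'complete' linkage merges clusters by their farthest members; scipy's
# linkage() also accepts 'single', 'average', 'weighted', etc. as drop-ins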
# basic method for tokenizing and reducing words to stems
def tokenize_and_stem(text):
    text = re.sub(r'[^\sa-zA-Z\d]', " ", text)  # strip punctuation (A-Z, not the buggy A-z range)
    text = re.sub(r'\s{2,}', " ", text)         # collapse repeated whitespace
    tokens = text.split()
    return [stemmer.stem(token) for token in tokens]
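# e.g. tokenize_and_stem("Running through the fields!") -> ['run', 'through',
# 'the', 'field']; stopwords survive stemming here and are only dropped later
# by TfidfVectorizer's 'english' list (and only when the stem is still on it)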
path = "path/to/corpus"
# corpus directory has individual text files for each band. each file is a collection of that band's lyrics, unprocessed
bands = []
all_lyrics = []
files = os.listdir(path)
# compile a list of all the lyrics across all bands
for fname in files:
    bands.append(fname.split('.txt')[0])
    # os.path.join instead of a hard-coded "\\" so the script isn't Windows-only
    with open(os.path.join(path, fname), "r") as infile:
        all_lyrics.append(infile.read())
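# quick sanity check: one lyrics document per band, in matching order
print("loaded %d bands" % len(all_lyrics))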
# generate tf-idf vectors on the entire corpus
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(all_lyrics)
dist = 1 - cosine_similarity(tfidf_matrix)
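# optional sanity check: report the most similar pair of bands before
# clustering (assumes the corpus contains at least two files)
sim = 1 - dist
np.fill_diagonal(sim, 0)  # ignore each band's self-similarity
i, j = np.unravel_index(sim.argmax(), sim.shape)
print("most similar pair: %s / %s (cosine %.3f)" % (bands[i], bands[j], sim[i, j]))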
# run hierarchical agglomerative clustering.
# linkage() expects a condensed distance matrix, so flatten the square cosine
# distances with squareform() rather than re-measuring them with pdist()
distMatrix = squareform(dist, checks=False)  # checks=False tolerates float noise on the diagonal
clusters = linkage(distMatrix, method=link_method)
T = to_tree(clusters, rd=False)
id2name = dict(enumerate(bands))
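# optional: a quick static dendrogram for eyeballing the tree before exporting
# to d3 (matplotlib is an extra dependency, not needed elsewhere):
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram
# dendrogram(clusters, labels=bands, orientation='right')
# plt.tight_layout()
# plt.savefig("dendrogram_%s.png" % link_method)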
# Create a nested dictionary from the ClusterNodes returned by SciPy
def add_node(node, parent):
    # First create the new node and append it to its parent's children
    newNode = dict(node_id=node.id, children=[])
    parent["children"].append(newNode)
    # Recursively add the current node's children
    if node.left: add_node(node.left, newNode)
    if node.right: add_node(node.right, newNode)
# Initialize nested dictionary for d3, then recursively iterate through the tree
d3Dendro = dict(children=[], name="Root1")
add_node(T, d3Dendro)
# Label each node with the names of each leaf in its subtree
def label_tree(n):
    # If the node is a leaf, then we have its name
    if len(n["children"]) == 0:
        leafNames = [id2name[n["node_id"]]]
    # If not, flatten all the leaves in the node's subtree
    # (no extra brackets around reduce, or the names end up double-nested)
    else:
        leafNames = reduce(lambda ls, c: ls + label_tree(c), n["children"], [])
    # Delete the node id since we don't need it anymore and it makes for cleaner JSON
    del n["node_id"]
    # Labeling convention: "-"-separated leaf names
    n["name"] = "-".join(sorted(map(str, leafNames)))
    return leafNames
label_tree(d3Dendro["children"][0])
# Output to JSON for D3; include the linkage method in the filename
with open("clusters_%s.json" % link_method, "w") as outfile:
    json.dump(d3Dendro, outfile, sort_keys=True, indent=4)
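# the output nests {"name": ..., "children": [...]} dicts, the shape
# d3.hierarchy()/d3.cluster() expect, e.g.
# {"name": "Root1", "children": [{"name": "banda-bandb-...", "children": [...]}]}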