Skip to content

Instantly share code, notes, and snippets.

@ansate
Created August 1, 2013 18:48
Show Gist options
  • Save ansate/6134094 to your computer and use it in GitHub Desktop.
Save ansate/6134094 to your computer and use it in GitHub Desktop.
Example Python code for Generating Clusters of Etsy Listings
#!/usr/bin/python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import MiniBatchKMeans
from scipy import sparse
from scipy.spatial import distance
from time import time
import pandas as pd
import numpy as np
import sys
# Stage 1: read the listings CSV named on the command line and split it
# into training and testing frames.
# The columns 'title' and 'tags_str' are read here; 'usd_price' is read by
# later stages.
inputname = sys.argv[1]
# Build the output names by stripping only a trailing ".csv".  This
# guarantees the outputs always differ from the input path, so an input
# without that extension is never overwritten by the final to_csv() calls.
_stem = inputname[:-len(".csv")] if inputname.endswith(".csv") else inputname
outtrainname = _stem + "_s3_training.csv"
outtestname = _stem + "_s3_testing.csv"
print(inputname)
df = pd.read_csv(inputname)
# Underscores in the tag string become spaces so the vectorizer tokenizes
# the individual words.  Missing titles/tags are treated as empty text;
# otherwise NaN would propagate into 'text' and be rejected downstream.
df['tags'] = df['tags_str'].fillna("").str.replace("_", " ")
df['text'] = df['title'].fillna("") + " " + df['tags']
# set aside 10% of the data for testing (the last 10% of rows, unshuffled)
trainLen = int(len(df)*.9)
# Copy the slices: both frames get new columns assigned later, and writing
# into a slice of df is a chained assignment.
traindf = df[:trainLen].copy()
testdf = df[trainLen:].copy()
# Stage 2: turn the training text into TF-IDF features, append the
# standardized price as one extra feature column, and cluster the result.
vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=20)
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(vectorizer.fit_transform(traindf['text']))
# z-score the price so it is on a scale comparable to the TF-IDF weights
traindf['normed_price'] = (traindf['usd_price'] - traindf.usd_price.mean())/traindf.usd_price.std()
# Reshape to an explicit n x 1 column before wrapping it: coo_matrix reads a
# 1-D array as a single 1 x n row, and a conflicting shape= argument is
# rejected as inconsistent by current SciPy.
price_column = np.asarray(traindf['normed_price'], dtype=float).reshape(-1, 1)
price_matrix = sparse.coo_matrix(price_column)
# The price becomes the LAST column of the feature matrix; later stages
# rely on that position to strip it from the centroids.
plus_matrix = sparse.hstack([tfidf_matrix, price_matrix], format="csr")
km = MiniBatchKMeans(n_clusters=40, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(plus_matrix)
print("done in %0.3fs" % (time() - t0))
# one cluster label per training row, in row order
traindf['cluster'] = km.labels_
# Stage 3: per-cluster price statistics computed on the training rows.
clusters = traindf['usd_price'].groupby(traindf['cluster'])
# Take the cluster ids from the groupby result instead of a hard-coded
# range: a cluster that received no training rows has no group, and a fixed
# list of ids would then not line up with the statistics.  reset_index()
# turns the ids into an ordinary 'cluster' column (and only a column), so a
# later merge on 'cluster' is unambiguous.
cstats = pd.DataFrame({'mean': clusters.mean(),
                       'median': clusters.median(),
                       'std': clusters.std(),
                       'count': clusters.count(),
                       'LQ': clusters.quantile(.25),
                       'UQ': clusters.quantile(.75)}).reset_index()
print(cstats)
# Stage 4: assign each test listing to its nearest cluster using text
# features only, predict its price from that cluster's training statistics,
# report the error, and write both frames out.
centroids = km.cluster_centers_
# drop the price column (the last one) so centroids live in TF-IDF space only
rcentroids = centroids[:, :-1]
# transform the test data with the vocabulary/IDF weights fitted on training data
test_tfidf = transformer.transform(vectorizer.transform(testdf['text']))
# for each row in test_tfidf, find the closest row in rcentroids; cdist
# computes the whole (rows x clusters) Euclidean distance table in one call
labels = distance.cdist(test_tfidf.toarray(), rcentroids, 'euclidean').argmin(axis=1)
testdf['cluster'] = labels
# Merge against a frame whose 'cluster' is a plain column only; a 'cluster'
# index level alongside the column would make the join key ambiguous.
# The default inner join drops test rows whose cluster has no statistics.
preds = pd.merge(testdf, cstats.reset_index(drop=True), on='cluster')
preds['pmeanerrsq'] = (preds['mean'] - preds['usd_price'])**2
preds['pmederrsq'] = (preds['median'] - preds['usd_price'])**2
print("Sum of squared error from the mean: %d" % preds['pmeanerrsq'].sum())
print("Sum of squared error from the median: %d" % preds['pmederrsq'].sum())
# .mean() is sum/count but yields NaN rather than raising when preds is empty
print("RMSE - mean: %.3f" % preds['pmeanerrsq'].mean()**.5)
print("RMSE - median: %.3f" % preds['pmederrsq'].mean()**.5)
traindf.to_csv(outtrainname, index=False)
testdf.to_csv(outtestname, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment