Created
          August 1, 2013 18:48 
        
      - 
      
 - 
        
Save ansate/6134094 to your computer and use it in GitHub Desktop.  
    Example Python code for Generating Clusters of Etsy Listings
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/usr/bin/python | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
| from sklearn.cluster import MiniBatchKMeans | |
| from scipy import sparse | |
| from scipy.spatial import distance | |
| from time import time | |
| import pandas as pd | |
| import numpy as np | |
| import sys | |
| inputname = sys.argv[1] | |
| outtrainname = inputname.replace(".csv", "_s3_training.csv") | |
| outtestname = inputname.replace(".csv", "_s3_testing.csv") | |
| print inputname | |
| df = pd.read_csv(inputname) | |
| df['tags'] = df['tags_str'].str.replace("_"," ") | |
| df['text'] = df['title'] + " " + df['tags'] | |
| # set aside 10% of the data for testing | |
| trainLen = int(len(df)*.9) | |
| traindf = df[:trainLen] | |
| testdf = df[trainLen:] | |
| vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=20) | |
| transformer = TfidfTransformer() | |
| tfidf_matrix = transformer.fit_transform(vectorizer.fit_transform(traindf['text'])) | |
| traindf['normed_price'] = (traindf['usd_price'] - traindf.usd_price.mean())/traindf.usd_price.std() | |
| price_matrix = sparse.coo_matrix((np.array(traindf['normed_price'])), shape=(len(traindf['normed_price']),1)) | |
| plus_matrix = sparse.hstack([tfidf_matrix, price_matrix.transpose()]) | |
| km = MiniBatchKMeans(n_clusters=40, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=False) | |
| print("Clustering sparse data with %s" % km) | |
| t0 = time() | |
| km.fit(plus_matrix) | |
| print("done in %0.3fs" % (time() - t0)) | |
| traindf['cluster'] = km.labels_ | |
| clusters = traindf['usd_price'].groupby(traindf['cluster']) | |
| cstats = pd.DataFrame({'cluster': range(40), | |
| 'mean': clusters.mean(), | |
| 'median': clusters.median(), | |
| 'std': clusters.std(), | |
| 'count': clusters.count(), | |
| 'LQ': clusters.quantile(.25), | |
| 'UQ' : clusters.quantile(.75)}) | |
| print(cstats) | |
| centroids = km.cluster_centers_ | |
| # pop the price column off the end | |
| rcentroids = np.delete(centroids, centroids.shape[1]-1, 1) | |
| # transform the test data | |
| test_tfidf = transformer.transform(vectorizer.transform(testdf['text'])) | |
| # for each row in test_terms, find the closest row in rcentroids | |
| labels = [np.argmin([distance.euclidean(row, cluster) for cluster in rcentroids]) for row in test_tfidf.toarray()] | |
| testdf['cluster'] = labels | |
| preds = pd.merge(testdf, cstats, on='cluster') | |
| preds['pmeanerrsq'] = (preds['mean'] - preds['usd_price'])**2 | |
| preds['pmederrsq'] = (preds['median'] - preds['usd_price'])**2 | |
| print "Sum of squared error from the mean: %d" % sum(preds['pmeanerrsq']) | |
| print "Sum of squared error from the median: %d" % sum(preds['pmederrsq']) | |
| print "RMSE - mean: %.3f" % (sum(preds['pmeanerrsq'])/preds.shape[0])**.5 | |
| print "RMSE - median: %.3f" % (sum(preds['pmederrsq'])/preds.shape[0])**.5 | |
| traindf.to_csv(outtrainname, index=False) | |
| testdf.to_csv(outtestname, index=False) | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment