ansate/gist:6134094

## gistfile1.py
#!/usr/bin/python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import MiniBatchKMeans
from scipy import sparse
from scipy.spatial import distance

from time import time

import pandas as pd
import numpy as np

import sys

# Usage: <script> INPUT.csv
#
# Clusters the first 90% of the rows on TF-IDF features of title + tags plus a
# standardised price column, computes per-cluster price statistics, assigns
# each held-out row to the nearest text-only centroid, and reports the error
# of predicting its price with the cluster mean / median. The labelled train
# and test frames are written to "<stem>_s3_training.csv" and
# "<stem>_s3_testing.csv".
N_CLUSTERS = 40

inputname = sys.argv[1]
# Build the output names from the stem. str.replace(".csv", ...) returns the
# name unchanged when there is no ".csv" suffix, in which case the to_csv()
# calls at the end would overwrite the input file; it would also rewrite any
# ".csv" occurring earlier in the path.
stem = inputname[:-len(".csv")] if inputname.endswith(".csv") else inputname
outtrainname = stem + "_s3_training.csv"
outtestname = stem + "_s3_testing.csv"

print(inputname)
df = pd.read_csv(inputname)
# A missing title or tag string would make the concatenated text NaN, which
# the vectorizer cannot process; treat missing values as empty text.
df['tags'] = df['tags_str'].fillna("").str.replace("_", " ")
df['text'] = df['title'].fillna("") + " " + df['tags']

# set aside 10% of the data for testing
trainLen = int(len(df) * .9)
# Copy the slices: columns are added to both frames below, and assigning into
# a slice of df triggers pandas' SettingWithCopy warning.
traindf = df[:trainLen].copy()
testdf = df[trainLen:].copy()

vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=20)
transformer = TfidfTransformer()

tfidf_matrix = transformer.fit_transform(vectorizer.fit_transform(traindf['text']))
traindf['normed_price'] = (traindf['usd_price'] - traindf['usd_price'].mean()) / traindf['usd_price'].std()
# Build the price feature directly as an (n_rows, 1) column. Passing a 1-D
# array together with shape=(n, 1) to coo_matrix is inconsistent (the array is
# read as one row); SciPy versions that validate the shape argument raise
# ValueError for it.
price_matrix = sparse.csr_matrix(traindf['normed_price'].values.reshape(-1, 1))
plus_matrix = sparse.hstack([tfidf_matrix, price_matrix]).tocsr()

km = MiniBatchKMeans(n_clusters=N_CLUSTERS, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=False)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(plus_matrix)
print("done in %0.3fs" % (time() - t0))

traindf['cluster'] = km.labels_

clusters = traindf['usd_price'].groupby(traindf['cluster'])

# Assemble the statistics from the groupby results only and take the cluster
# id from their shared index. Mixing in a literal range(40) fails as soon as a
# cluster receives no training rows (length mismatch), and leaves 'cluster' as
# both the index name and a column, which makes merge(on='cluster') ambiguous.
cstats = pd.DataFrame({'mean': clusters.mean(),
                       'median': clusters.median(),
                       'std': clusters.std(),
                       'count': clusters.count(),
                       'LQ': clusters.quantile(.25),
                       'UQ': clusters.quantile(.75)})
cstats.index.name = 'cluster'
cstats = cstats.reset_index()
print(cstats)
centroids = km.cluster_centers_
# pop the price column off the end so centroids live in the text-only space
rcentroids = np.delete(centroids, -1, axis=1)

# transform the test data with the vocabulary / idf weights fitted on train
test_tfidf = transformer.transform(vectorizer.transform(testdf['text']))

# for each row in test_tfidf, find the closest row in rcentroids; cdist
# computes the whole distance matrix in one call instead of a Python double
# loop over rows and centroids
labels = distance.cdist(test_tfidf.toarray(), rcentroids, 'euclidean').argmin(axis=1)
testdf['cluster'] = labels

# Inner join: test rows whose cluster has no training statistics are dropped.
preds = pd.merge(testdf, cstats, on='cluster')

preds['pmeanerrsq'] = (preds['mean'] - preds['usd_price'])**2
preds['pmederrsq'] = (preds['median'] - preds['usd_price'])**2

# Sum each error column once and reuse it for both the SSE and RMSE lines.
sse_mean = sum(preds['pmeanerrsq'])
sse_median = sum(preds['pmederrsq'])
n_preds = preds.shape[0]

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("Sum of squared error from the mean: %d" % sse_mean)
print("Sum of squared error from the median: %d" % sse_median)
print("RMSE - mean: %.3f" % ((sse_mean / n_preds) ** .5))
print("RMSE - median: %.3f" % ((sse_median / n_preds) ** .5))

traindf.to_csv(outtrainname, index=False)
testdf.to_csv(outtestname, index=False)
	#!/usr/bin/python
	from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
	from sklearn.cluster import MiniBatchKMeans
	from scipy import sparse
	from scipy.spatial import distance

	from time import time

	import pandas as pd
	import numpy as np

	import sys

	# Usage: <script> INPUT.csv
	#
	# Clusters the first 90% of the rows on TF-IDF features of title + tags plus a
	# standardised price column, computes per-cluster price statistics, assigns
	# each held-out row to the nearest text-only centroid, and reports the error
	# of predicting its price with the cluster mean / median. The labelled train
	# and test frames are written to "<stem>_s3_training.csv" and
	# "<stem>_s3_testing.csv".
	N_CLUSTERS = 40

	inputname = sys.argv[1]
	# Build the output names from the stem. str.replace(".csv", ...) returns the
	# name unchanged when there is no ".csv" suffix, in which case the to_csv()
	# calls at the end would overwrite the input file; it would also rewrite any
	# ".csv" occurring earlier in the path.
	stem = inputname[:-len(".csv")] if inputname.endswith(".csv") else inputname
	outtrainname = stem + "_s3_training.csv"
	outtestname = stem + "_s3_testing.csv"

	print(inputname)
	df = pd.read_csv(inputname)
	# A missing title or tag string would make the concatenated text NaN, which
	# the vectorizer cannot process; treat missing values as empty text.
	df['tags'] = df['tags_str'].fillna("").str.replace("_", " ")
	df['text'] = df['title'].fillna("") + " " + df['tags']

	# set aside 10% of the data for testing
	trainLen = int(len(df) * .9)
	# Copy the slices: columns are added to both frames below, and assigning into
	# a slice of df triggers pandas' SettingWithCopy warning.
	traindf = df[:trainLen].copy()
	testdf = df[trainLen:].copy()

	vectorizer = CountVectorizer(stop_words="english", max_df=0.95, min_df=20)
	transformer = TfidfTransformer()

	tfidf_matrix = transformer.fit_transform(vectorizer.fit_transform(traindf['text']))
	traindf['normed_price'] = (traindf['usd_price'] - traindf['usd_price'].mean()) / traindf['usd_price'].std()
	# Build the price feature directly as an (n_rows, 1) column. Passing a 1-D
	# array together with shape=(n, 1) to coo_matrix is inconsistent (the array is
	# read as one row); SciPy versions that validate the shape argument raise
	# ValueError for it.
	price_matrix = sparse.csr_matrix(traindf['normed_price'].values.reshape(-1, 1))
	plus_matrix = sparse.hstack([tfidf_matrix, price_matrix]).tocsr()

	km = MiniBatchKMeans(n_clusters=N_CLUSTERS, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=False)

	print("Clustering sparse data with %s" % km)
	t0 = time()
	km.fit(plus_matrix)
	print("done in %0.3fs" % (time() - t0))

	traindf['cluster'] = km.labels_

	clusters = traindf['usd_price'].groupby(traindf['cluster'])

	# Assemble the statistics from the groupby results only and take the cluster
	# id from their shared index. Mixing in a literal range(40) fails as soon as a
	# cluster receives no training rows (length mismatch), and leaves 'cluster' as
	# both the index name and a column, which makes merge(on='cluster') ambiguous.
	cstats = pd.DataFrame({'mean': clusters.mean(),
	                       'median': clusters.median(),
	                       'std': clusters.std(),
	                       'count': clusters.count(),
	                       'LQ': clusters.quantile(.25),
	                       'UQ': clusters.quantile(.75)})
	cstats.index.name = 'cluster'
	cstats = cstats.reset_index()
	print(cstats)
	centroids = km.cluster_centers_
	# pop the price column off the end so centroids live in the text-only space
	rcentroids = np.delete(centroids, -1, axis=1)

	# transform the test data with the vocabulary / idf weights fitted on train
	test_tfidf = transformer.transform(vectorizer.transform(testdf['text']))

	# for each row in test_tfidf, find the closest row in rcentroids; cdist
	# computes the whole distance matrix in one call instead of a Python double
	# loop over rows and centroids
	labels = distance.cdist(test_tfidf.toarray(), rcentroids, 'euclidean').argmin(axis=1)
	testdf['cluster'] = labels

	# Inner join: test rows whose cluster has no training statistics are dropped.
	preds = pd.merge(testdf, cstats, on='cluster')

	preds['pmeanerrsq'] = (preds['mean'] - preds['usd_price'])**2
	preds['pmederrsq'] = (preds['median'] - preds['usd_price'])**2

	# Sum each error column once and reuse it for both the SSE and RMSE lines.
	sse_mean = sum(preds['pmeanerrsq'])
	sse_median = sum(preds['pmederrsq'])
	n_preds = preds.shape[0]

	# Single-argument print(...) behaves the same under Python 2 and Python 3.
	print("Sum of squared error from the mean: %d" % sse_mean)
	print("Sum of squared error from the median: %d" % sse_median)
	print("RMSE - mean: %.3f" % ((sse_mean / n_preds) ** .5))
	print("RMSE - median: %.3f" % ((sse_median / n_preds) ** .5))

	traindf.to_csv(outtrainname, index=False)
	testdf.to_csv(outtestname, index=False)