Skip to content

Instantly share code, notes, and snippets.

@chemikadze
Last active April 9, 2017 02:09
Show Gist options
  • Save chemikadze/f3a1deca6fe8c0822ab81960911361c1 to your computer and use it in GitHub Desktop.
Save chemikadze/f3a1deca6fe8c0822ab81960911361c1 to your computer and use it in GitHub Desktop.
Strava ride clustering
from IPython.lib.pretty import pprint
from numpy import isnan
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import Vectors
# hypothesis: five ride types - mtb offroad normal, mtb road normal, mtb flat intervals, mtb hill intervals, road normal
def do_clustering(sc, sqlCtx):
    """Train k-means models for k = 1..14 on ride activities and print each.

    Reads the activities JSON, keeps only rows of type "Ride" with an average
    heart rate above 60 and a distance above 5, and maps every remaining row
    to a 3-component feature vector:

        (average_speed * 3.6,
         average_heartrate,
         total_elevation_gain / distance * 1000)

    NOTE(review): the 3.6 and 1000 factors look like m/s -> km/h and
    "metres climbed per km" conversions -- confirm against the input schema.

    For each cluster count the within-set sum of squared errors and the
    cluster centers are printed to stdout.

    Args:
        sc: Spark context. Not used by this function; kept so existing
            callers do not have to change.
        sqlCtx: SQL context whose ``sparkSession`` is used to read the data.

    Returns:
        None. Results are only printed.
    """
    ss = sqlCtx.sparkSession
    activities = ss.read.json("gs://chemikadze/strava-analysis/activities_full.json")

    def is_usable_ride(r):
        # Fields absent from a JSON record come back as None. Ordering
        # comparisons and arithmetic on None raise TypeError on Python 3
        # (and the arithmetic does on Python 2 as well), so drop such rows
        # explicitly before they reach the feature mapping.
        return (
            r.type == "Ride"
            and r.average_heartrate is not None
            and r.average_speed is not None
            and r.distance is not None
            and r.total_elevation_gain is not None
            and r.average_heartrate > 60
            and r.distance > 5
        )

    def to_features(r):
        return Vectors.dense(
            r.average_speed * 3.6,
            r.average_heartrate,
            # float() forces true division even if both columns were
            # inferred as integers and the code runs under Python 2.
            float(r.total_elevation_gain) / r.distance * 1000,
        )

    # The same RDD is scanned by every training run and every cost
    # computation below, so keep it cached for the duration of the loop.
    train_data = activities.rdd.filter(is_usable_ride).map(to_features).persist()
    num_iterations = 50
    try:
        # range() instead of xrange(): works on both Python 2 and Python 3.
        for num_clusters in range(1, 15):
            clusters = KMeans.train(train_data, num_clusters, maxIterations=num_iterations)
            wsse = clusters.computeCost(train_data)
            print("=====================")
            print("clusters: %s wsse: %s" % (num_clusters, wsse))
            pprint(clusters.clusterCenters)
            print("")
    finally:
        # Release the cached partitions even if training fails part-way.
        train_data.unpersist()
=====================
clusters: 1 wsse: 93584.7642615
[array([ 18.49169096, 141.5290411 , 9.57007626])]
=====================
clusters: 2 wsse: 51653.9041397
[array([ 18.41273115, 148.92459016, 11.07617648]),
array([ 18.6509157 , 126.61570248, 6.53298159])]
=====================
clusters: 3 wsse: 41942.6155675
[array([ 10.8177 , 128.0875 , 27.56189311]),
array([ 18.56358238, 149.54273128, 10.44243336]),
array([ 19.96411579, 128.40175439, 4.0452634 ])]
=====================
clusters: 4 wsse: 27931.9319376
[array([ 15.1584 , 116.85416667, 9.46983675]),
array([ 18.28769268, 150.4152439 , 10.02375824]),
array([ 12.80055484, 151.33548387, 28.78192362]),
array([ 21.52348525, 136.8 , 4.11794825])]
=====================
clusters: 5 wsse: 24669.9852226
[array([ 13.24812857, 153.46785714, 28.84986639]),
array([ 21.85741463, 140.95284553, 4.24549706]),
array([ 12.22992 , 117.18 , 17.74435366]),
array([ 17.52289254, 151.24701493, 11.11531333]),
array([ 18.84076364, 124.13090909, 4.18226563])]
=====================
clusters: 6 wsse: 20112.4224287
[array([ 11.56225263, 124.28947368, 22.40440807]),
array([ 21.85616629, 134.52696629, 3.41238669]),
array([ 11.26877143, 151.73571429, 34.54716717]),
array([ 16.11535135, 154.51486486, 14.96312391]),
array([ 19.8342 , 146.87205882, 7.40644953]),
array([ 16.26774545, 114.86969697, 5.01463697])]
=====================
clusters: 7 wsse: 17435.3423657
[array([ 21.32876471, 144.41666667, 5.3934587 ]),
array([ 9.238 , 139.31111111, 37.13987924]),
array([ 16.0111125 , 114.6375 , 5.17134437]),
array([ 12.062475 , 122.975 , 20.5382479]),
array([ 17.11121416, 151.65132743, 11.83406696]),
array([ 14.50422857, 159.02857143, 25.07988355]),
array([ 21.49 , 132.8 , 3.48144481])]
=====================
clusters: 8 wsse: 16078.0656302
[array([ 22.34222222, 144.7382716 , 4.35659797]),
array([ 14.05816364, 157.26818182, 27.6377888 ]),
array([ 10.57896 , 129.23333333, 28.20258724]),
array([ 21.20078919, 132.99054054, 3.81229138]),
array([ 17.8422 , 155.56481481, 10.10725364]),
array([ 12.6492 , 107.56666667, 14.6033539 ]),
array([ 16.46404138, 117.10344828, 4.61640735]),
array([ 16.64377778, 147.28395062, 12.5421726 ])]
=====================
clusters: 9 wsse: 14688.4978365
[array([ 21.75560597, 132.06567164, 2.95348401]),
array([ 16.64314286, 159.86190476, 11.4000495 ]),
array([ 15.52347 , 151.08 , 16.22124704]),
array([ 10.25112 , 128.23333333, 28.66353089]),
array([ 15.13058824, 114.17941176, 7.09277694]),
array([ 17.21630204, 141.22653061, 10.66913325]),
array([ 19.2331125 , 150.6421875 , 8.57764353]),
array([ 13.75155 , 158.36875 , 29.87031924]),
array([ 23.02791864, 144.21525424, 3.15515734])]
=====================
clusters: 10 wsse: 12686.6834652
[array([ 22.83398491, 136.11698113, 2.25672241]),
array([ 18.45165405, 129.16756757, 7.29148748]),
array([ 9.238 , 139.31111111, 37.13987924]),
array([ 14.34305455, 163.17272727, 28.23340588]),
array([ 17.403225 , 143.31875 , 10.68176622]),
array([ 10.94 , 114.21111111, 23.83249491]),
array([ 16.14888 , 114.83333333, 4.75670706]),
array([ 15.393 , 150.49722222, 17.95108856]),
array([ 22.58628197, 145.91639344, 3.92830588]),
array([ 17.82319437, 154.15492958, 9.90247663])]
=====================
clusters: 11 wsse: 12264.6880846
[array([ 15.49392632, 150.26578947, 17.76128292]),
array([ 15.81955714, 114.33571429, 5.08836354]),
array([ 22.8807 , 138.785 , 2.74174112]),
array([ 16.48877143, 163.32857143, 15.09638114]),
array([ 12.71232 , 157.37 , 33.12208065]),
array([ 10.90035 , 113.05 , 23.34386922]),
array([ 9.92847273, 131.94545455, 29.4181611 ]),
array([ 20.51113171, 128.96585366, 3.90980567]),
array([ 17.7985125 , 152.496875 , 9.63050466]),
array([ 17.48228936, 141.50212766, 10.58679503]),
array([ 22.47267273, 147.85 , 4.18217891])]
=====================
clusters: 12 wsse: 11202.7464003
[array([ 22.63210435, 147.57826087, 3.95245456]),
array([ 10.23325714, 124.77142857, 29.60598989]),
array([ 17.546475 , 143.30416667, 11.29879874]),
array([ 16.33896 , 163.14666667, 13.60581841]),
array([ 8.18424 , 142.88 , 41.44058217]),
array([ 15.23178 , 131.32 , 11.40703088]),
array([ 17.18361429, 151.975 , 11.36787419]),
array([ 22.73854426, 138.26229508, 2.91989145]),
array([ 11.2056 , 109.13333333, 19.39290411]),
array([ 14.5412 , 155.43888889, 26.59558906]),
array([ 22.2064 , 128.11111111, 2.10771663]),
array([ 15.88795714, 114.925 , 4.90173801])]
=====================
clusters: 13 wsse: 10312.657661
[array([ 18.560775 , 154.2625 , 8.62968799]),
array([ 8.0862 , 134.56666667, 36.95139746]),
array([ 14.6286 , 130.36875 , 12.24990262]),
array([ 22.5279 , 147.06666667, 3.91323292]),
array([ 23.66271429, 137.2047619 , 1.61120476]),
array([ 22.03416 , 127.76 , 2.22510478]),
array([ 14.6718 , 175.9 , 35.53503772]),
array([ 15.39189474, 153.91842105, 15.83886042]),
array([ 19.22241951, 139.92926829, 7.37384222]),
array([ 15.81331034, 114.63103448, 5.14771645]),
array([ 17.1364 , 146.15111111, 12.33139923]),
array([ 10.94 , 114.21111111, 23.83249491]),
array([ 13.713075 , 154.85 , 28.50306786])]
=====================
clusters: 14 wsse: 9079.07954898
[array([ 17.38536585, 142.74146341, 10.88346639]),
array([ 15.81189231, 116.03076923, 5.07582023]),
array([ 8.45055 , 138.5375 , 38.03433723]),
array([ 14.9319 , 102. , 8.3211653]),
array([ 22.71531429, 136.34285714, 2.4798319 ]),
array([ 20.07687857, 150.35178571, 7.47258486]),
array([ 13.1922 , 170.5 , 32.14681167]),
array([ 16.6968 , 159.44347826, 11.19629668]),
array([ 22.03416 , 127.76 , 2.22510478]),
array([ 23.24347826, 144.18478261, 2.98302143]),
array([ 15.23178 , 131.32 , 11.40703088]),
array([ 15.96889565, 150.80869565, 14.14488373]),
array([ 10.77615 , 115.925 , 24.81503051]),
array([ 14.8509 , 153.7375 , 24.90207169])]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment