Skip to content

Instantly share code, notes, and snippets.

@mjcreativeventures
Created February 15, 2016 05:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjcreativeventures/41cb2915acc8c6b0a1e2 to your computer and use it in GitHub Desktop.
Save mjcreativeventures/41cb2915acc8c6b0a1e2 to your computer and use it in GitHub Desktop.
Apply KDE to MRT station twitter data
import cPickle as pickle
import math
# Fit a kernel density estimate (KDE) to the tweet times of each station and
# plot it over a normalized histogram of the same times, one subplot per
# station laid out on a grid MAX_COLS wide.
#
# NOTE(review): np, plt, GridSearchCV and KernelDensity are used below but are
# not imported in the visible part of this file; presumably they are provided
# by the surrounding environment (e.g. a notebook session) -- confirm.

CUTOFF = 50   # ignore stations with fewer than 50 tweets
MAX_COLS = 4  # number of subplots per row

# Aggregated station data stored in a pickled file.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The with-block makes sure the file handle is closed after loading.
with open('station_tweets.p', 'rb') as infile:
    stationdata = pickle.load(infile)

# Apply the cutoff and sort by station name.
stations = np.sort([
    key for key in stationdata
    if len(stationdata[key]['alltimes']) >= CUTOFF
])

# 96 evaluation points spanning 0-24 (roughly one per 15 minutes of time).
hours = np.linspace(0, 24, 96)
hour_grid = hours[:, None]  # column vector, the shape score_samples expects

# Candidate bandwidths for the cross-validated search (same for every station).
bandwidths = np.linspace(0.4, 4.0, 50)

# Ceiling division; keep at least one row so plt.subplots does not fail when
# no station passes the cutoff.
rows = max(1, int(math.ceil(1.0 * len(stations) / MAX_COLS)))

# squeeze=False keeps `ax` two-dimensional even when there is a single row,
# so ax[rowidx][colidx] below is always valid.
fig, ax = plt.subplots(rows, MAX_COLS, sharey=True, figsize=(18, 5),
                       squeeze=False)
fig.subplots_adjust(top=7, wspace=0)

for (i, st) in enumerate(stations):
    # The 'alltimes' key contains an array of tweet times for the station.
    pts = np.array(stationdata[st]['alltimes'])

    # Pick the bandwidth by CUTOFF-fold cross-validation; every station kept
    # above has at least CUTOFF samples, so no fold is empty.
    grid = GridSearchCV(KernelDensity(), {'bandwidth': bandwidths}, cv=CUTOFF)
    grid.fit(pts[:, None])
    kde = grid.best_estimator_

    # score_samples returns log-density; exponentiate to get the density.
    pdf = np.exp(kde.score_samples(hour_grid))

    # Explicit integer row/column, independent of Python 2 vs 3 division.
    rowidx, colidx = divmod(i, MAX_COLS)
    axsubplot = ax[rowidx][colidx]

    axsubplot.plot(hours, pdf, linewidth=3, alpha=0.5,
                   label='bw=%.2f' % kde.bandwidth)
    # NOTE(review): `normed` was renamed to `density` in newer matplotlib
    # releases; switch the keyword if this is run on a recent version.
    axsubplot.hist(pts, 96, fc='gray', histtype='stepfilled', alpha=0.3,
                   normed=True)
    axsubplot.legend(loc='upper left')
    axsubplot.set_title(st)
    axsubplot.set_xlim(0, 24)
    axsubplot.set_ylim(0, .3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment