Skip to content

Instantly share code, notes, and snippets.

@shahradj
Created September 1, 2017 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shahradj/7870806b606d6f281336d6f69242431c to your computer and use it in GitHub Desktop.
Save shahradj/7870806b606d6f281336d6f69242431c to your computer and use it in GitHub Desktop.
Perform clustering on mouse tracking data
import pandas as pd
from scipy.cluster.hierarchy import fclusterdata
from pymongo import MongoClient
import numpy as np
from datetime import *
import json
def saveToRelational(jsonPacket):
    """
    save the received json packet to a relational database

    jsonPacket is a dict with a 'data' entry (list of tracking events) and an
    'ip' entry; both are handed to packetToDF. Every resulting row gets an
    ISO-8601 'timestamp' string derived from the dataframe index and is passed
    to insertIntoDb, which is still a stub. Returns None.
    """
    def insertIntoDb(values, columns):
        # TODO: not implemented yet, rows are currently dropped
        return

    events = jsonPacket['data']
    if not events:
        # nothing to store; also avoids building a dataframe from no events
        return

    df = packetToDF(events, jsonPacket['ip'])
    # datetime/Timestamp expose isoformat(); build a real list so the
    # assignment does not depend on map() being eager
    df['timestamp'] = [stamp.isoformat() for stamp in df.index]
    columns = list(df.columns)
    # iterrows() yields (index, row) pairs: hand over the row values only
    for _, row in df.iterrows():
        insertIntoDb(row.tolist(), columns)
def packetToDF(jsonPacket, ip):
    """
    convert all mouse tracking data of a json packet into a pandas dataframe

    jsonPacket: list of event dicts; each needs a 'ts' entry (epoch
        milliseconds, int or numeric string) and may carry any of
        x, y, w, h, isClick, target, url, type, text (missing keys become None)
    ip: value stored in the 'ip' column of every row

    Returns a dataframe indexed by event time (local time, ascending) with the
    event columns plus:
        moves - position of the event after sorting (0, 1, 2, ...)
        time  - whole milliseconds elapsed since the first event
        ip    - the given ip
    'text' is lower-cased and None is replaced by "".
    An empty packet yields an empty dataframe with the same columns.
    """
    columns = ['x', 'y', 'w', 'h', 'isClick', 'target', 'url', 'type', 'text']
    if not jsonPacket:
        # no first event to measure elapsed time from
        return pd.DataFrame(columns=columns + ['moves', 'time', 'ip'])

    rows = [[event.get(col) for col in columns] for event in jsonPacket]
    # divide by a float so sub-second precision survives integer division
    stamps = [datetime.fromtimestamp(int(event['ts']) / 1000.0) for event in jsonPacket]
    # stable sort keeps the arrival order of events sharing a timestamp
    data = pd.DataFrame(rows, index=stamps, columns=columns).sort_index(kind='mergesort')

    data['moves'] = np.arange(len(data))
    elapsed = data.index - data.index[0]
    data['time'] = np.asarray(elapsed // pd.Timedelta(milliseconds=1), dtype=int)
    data['ip'] = [ip] * len(data)
    data['text'] = [x.lower() if x is not None else "" for x in data['text']]
    return data
class Clusterer():
    """
    clustering the raw data sent from tracker.js

    Attributes set by __init__:
        jsonPackets       - list of decoded packets, one per non-blank input line
        data              - dataframe returned by userSessions()
        sessionClusterMap - only when cluster=True: {session id: group label}
    """

    # Inactivity (in milliseconds) after which a new session starts: 10 minutes.
    # NOTE(review): the original docstring mentioned 1 hour while the code
    # compared against 600000 - the coded value is kept; confirm the intent.
    SESSION_GAP_MS = 600000

    def __init__(self, cluster=False, packetsFile='examplePackets.txt'):
        """
        load the packets (one json document per line) and build the sessions

        cluster: when True, also cluster the sessions into behavioural groups
        packetsFile: path of the file to read
        """
        with open(packetsFile) as f:
            # a real list (not a one-shot iterator); blank lines are skipped
            self.jsonPackets = [json.loads(line) for line in f if line.strip()]
        self.data = self.userSessions()
        if cluster:
            if 'session' in self.data.columns:
                self.sessionClusterMap = self.clusteringOfSessions(self.data)
            else:
                # no packets, hence no sessions to cluster
                self.sessionClusterMap = {}

    def userSessions(self):
        """
        combines all packets of mouse tracking data for a single ip into a sorted dataframe
        MongoDB saves the json packets as sent by the js directly into the database

        Every packet with events is converted with packetToDF, the rows are
        grouped per ip and split into sessions by separateSessions. The result
        is indexed by 'timestamp' and carries a 'Date' column ('YYYY-MM-DD').
        A single packet goes through the same steps, so the 'session' column is
        always present. Without any events an empty dataframe is returned.
        """
        frames = [
            packetToDF(jsonPacket['data'], jsonPacket['ip'])
            for jsonPacket in self.jsonPackets
            if jsonPacket['data']
        ]
        if not frames:
            return pd.DataFrame([])
        data = pd.concat(frames)
        # explicit loop over the groups: each group keeps its 'ip' column
        sessions = pd.concat(
            [self.separateSessions(group) for _, group in data.groupby('ip')]
        )
        sessions['Date'] = [x.strftime('%Y-%m-%d') for x in sessions['timestamp']]
        return sessions.set_index('timestamp')

    def separateSessions(self, df):
        """
        separates sessions of the same IP and identify them as IP(0), IP(1), IP(2) etc,
        where a new session starts with the first event that follows at least
        SESSION_GAP_MS milliseconds (10 minutes) without any movement

        df: rows of one ip, indexed by event time, with the columns
            ip, time, x, y, w, h and isClick
        Returns the rows sorted by time; per session these columns are set:
            time      - milliseconds since the first event of the session
            x_norm    - x / (w + 1)
            y_norm    - y / (h + 1)
            nClicks   - sum of isClick over the earlier rows of the session
            timestamp - copy of the index
            session   - '<ip>(<n>)' with n counting from 0
        """
        if len(df) == 0:
            return df.copy()
        # rows of several packets may interleave: order them by time (stable)
        df = df.sort_index(kind='mergesort')
        ip = df['ip'].values[0]
        # measure gaps on the timestamps; the incoming 'time' column restarts
        # at zero for every packet and cannot be compared across packets
        elapsed = np.asarray(
            (df.index - df.index[0]) // pd.Timedelta(milliseconds=1),
            dtype=np.int64
        )
        # np.diff(elapsed)[i] is the pause before row i + 1, so a long pause
        # makes row i + 1 the first row of the next session
        starts = [0] + [
            i + 1 for i, gap in enumerate(np.diff(elapsed))
            if gap >= self.SESSION_GAP_MS
        ]
        bounds = starts + [len(df)]

        sessions = []
        for number, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
            # copy so the assignments below never write into a view of df
            sess = df.iloc[lo:hi].copy()
            sess['time'] = elapsed[lo:hi] - elapsed[lo]
            sess['x_norm'] = sess['x'] / (sess['w'] + 1)
            sess['y_norm'] = sess['y'] / (sess['h'] + 1)
            # exclusive running total; missing/non-numeric flags count as 0
            clicks = pd.to_numeric(sess['isClick'], errors='coerce').fillna(0).values
            sess['nClicks'] = np.concatenate(([0], np.cumsum(clicks)[:-1])).astype(int)
            sess['timestamp'] = sess.index
            sess['session'] = '{0}({1})'.format(ip, number)
            sessions.append(sess)
        return pd.concat(sessions)

    def clusteringOfSessions(self, data, clusteringVariables=('x_norm', 'y_norm'), n_movements=50):
        """
        performs clustering across all the different sessions on the first
        n_movements (default 50) mouse movements

        data: dataframe with a 'session' column and the clustering variables
        clusteringVariables: columns whose first n_movements values form the
            feature vector of a session
        n_movements: sessions with fewer rows are ignored
        Returns {session id: 'behavioural group <k>'}; an empty dict when no
        session is long enough, and a single group when only one qualifies.
        """
        assert 'session' in data.columns, "data needs a 'session' column"
        variables = list(clusteringVariables)
        sessions = []
        mouseMovements = []
        for session, moves in data.groupby('session'):
            if len(moves) < n_movements:
                continue
            sessions.append(session)
            mouseMovements.append(
                moves[variables].iloc[:n_movements].values.flatten().tolist()
            )
        if not sessions:
            return {}
        if len(sessions) == 1:
            # hierarchical clustering needs two observations; a lone session
            # is its own group
            groups = [1]
        else:
            groups = fclusterdata(np.array(mouseMovements), 1)
        return dict(
            (session, 'behavioural group ' + str(group))
            for session, group in zip(sessions, groups)
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment