Skip to content

Instantly share code, notes, and snippets.

@fulmicoton
Created December 11, 2013 12:29
Show Gist options
  • Save fulmicoton/7909562 to your computer and use it in GitHub Desktop.
Save fulmicoton/7909562 to your computer and use it in GitHub Desktop.
The parse function returns a generator of Session objects. It takes a generator of tuples as input.
import itertools
from collections import defaultdict, OrderedDict
from math import log
def dcg(scores):
    """Return the discounted cumulative gain of a ranked sequence of scores.

    The score found at 0-based rank ``i`` contributes
    ``(2**score - 1) / log(i + 2)``, where ``log`` is the natural logarithm.
    An empty sequence yields 0.
    """
    total = 0
    for rank, relevance in enumerate(scores):
        gain = 2 ** relevance - 1
        discount = log(rank + 2)
        total += gain / discount
    return total
class Session(object):
    """A search session: its id, day, user and the queries issued in it.

    Instances order by ``(user, sid)``.
    """

    # Each attribute is listed exactly once; a repeated slot name only wastes
    # per-instance storage.
    __slots__ = ('sid', 'day', 'user', 'queries')

    def __init__(self, sid, day, user, queries):
        self.sid = sid
        self.day = day
        self.user = user
        self.queries = queries

    def print_debug(self):
        """Print the session header followed by each of its queries."""
        # Joining the pieces with single spaces reproduces what the
        # multi-argument print statement emits, on Python 2 and 3 alike.
        print("-----")
        print(" ".join(map(str, (
            "Session ", self.sid, "day=", self.day, "user=", self.user,
        ))))
        for query in self.queries:
            query.print_debug(indent=2)

    def __cmp__(self, other):
        """Three-way comparison on ``(user, sid)``; returns -1, 0 or 1."""
        mine = (self.user, self.sid)
        theirs = (other.user, other.sid)
        # Equivalent to the cmp() builtin, without depending on it.
        return (mine > theirs) - (mine < theirs)

    def __lt__(self, other):
        """Mirror ``__cmp__`` so sorting works where ``__cmp__`` is ignored."""
        return (self.user, self.sid) < (other.user, other.sid)

    @staticmethod
    def from_rows(session, rows):
        """Build a Session from the rows that belong to one session.

        ``rows`` is an indexable sequence of rows; element 0 of every row is
        skipped. The first row is the metadata row (``"M"``, day, user). Each
        later row holds a time and a record type: ``"Q"``/``"T"`` rows are
        parsed by ``QueryEvent.parse`` and ``"C"`` rows by
        ``ClickEvent.parse``.

        Returns a Session whose ``queries`` is a list in first-seen serp
        order. Raises AssertionError on an unexpected record type and
        KeyError when a click refers to a serp with no query.
        """
        meta = rows[0][1:]
        assert meta[0] == "M"
        day = int(meta[1])
        user = int(meta[2])
        queries = OrderedDict()
        # Click still waiting for the next action so its dwell time can be set.
        pending_click = None
        for row in rows[1:]:
            fields = row[1:]
            if pending_click is not None:
                pending_click.dwell_time = int(fields[0]) - pending_click.time
            if fields[1] in ("Q", "T"):
                pending_click = None
                query = QueryEvent.parse(fields)
                queries[query.serp] = query
            else:
                assert fields[1] == "C"
                click = ClickEvent.parse(fields)
                queries[click.serp].clicks.append(click)
                pending_click = click
        # list() keeps ``queries`` a real list even where values() is a view.
        return Session(session, day, user, list(queries.values()))

    def __repr__(self):
        return "Session<%i,user=%i>" % (self.sid, self.user)
class ClickEvent(object):
    """A click on a result: when it happened, on which serp and which url."""

    # Inheriting from ``object`` is what makes __slots__ take effect on
    # Python 2; on an old-style class it is just an ordinary class attribute.
    __slots__ = ('time', 'serp', 'url', 'dwell_time')

    def __init__(self, time, serp, url):
        self.time = time
        self.serp = serp
        self.url = url
        # Default kept until a caller assigns a measured value; 1000 falls in
        # the highest satisfaction grade.
        self.dwell_time = 1000

    def print_debug(self, indent):
        """Print the click on one line, prefixed by ``indent`` spaces."""
        # Space-joined so the output matches a multi-argument print statement.
        print(" ".join(map(str, (
            " " * indent, "click", "time=", self.time, "url=", self.url,
            "dwell_time=", self.dwell_time,
        ))))

    def satisfaction(self):
        """Grade the click from its dwell time.

        Returns 2 for a dwell time of at least 400, 1 for 50 to 399, and 0
        otherwise.
        """
        if self.dwell_time >= 400:
            return 2
        if self.dwell_time >= 50:
            return 1
        return 0

    @staticmethod
    def parse(fields):
        """Build a ClickEvent from ``fields``.

        Reads the time from index 0, the serp from index 2 and the url from
        index 3, converting each to int; index 1 is not read.
        """
        return ClickEvent(
            int(fields[0]),
            int(fields[2]),
            int(fields[3]),
        )
class QueryEvent(object):
    """A query and its result page: time, serp id, query id, terms and hits.

    ``hits`` is a list of exactly ten ``(url, domain)`` pairs and ``clicks``
    collects the click events attached to this result page.
    """

    __slots__ = ('time', 'is_test', 'serp', 'query', 'terms', 'hits', 'clicks')

    def __init__(self,
                 time,
                 is_test,
                 serp,
                 query,
                 terms,
                 hits):
        self.time = time
        self.is_test = is_test
        self.serp = serp
        self.query = query
        self.terms = terms
        self.hits = hits
        self.clicks = []
        assert len(self.hits) == 10

    def get_url_domain(self, qurl):
        """Return the domain paired with ``qurl`` in the hits, or None."""
        for (url, domain) in self.hits:
            if url == qurl:
                return domain
        return None

    def print_debug(self, indent):
        """Print the query, its hits and its clicks, indented by ``indent``."""
        # Each line is space-joined so the output matches what a
        # multi-argument print statement produces.
        pad = " " * indent
        terms = ",".join(map(str, self.terms))
        print(" ".join(map(str, (
            pad, "- Query", "time=", self.time, "terms", terms, self.is_test,
        ))))
        print(" ".join((pad, " Hits", ",".join(map(str, self.hits)))))
        for click in self.clicks:
            click.print_debug(indent + 2)

    def urls(self):
        """Return the urls of the hits, in order, as a tuple."""
        # Built directly rather than by subscripting zip(), which is lazy on
        # Python 3 and cannot be indexed there.
        return tuple(hit[0] for hit in self.hits)

    @staticmethod
    def parse(fields):
        """Build a QueryEvent from ``fields``.

        Index 0 is the time, index 1 the record type (``"T"`` marks a test
        query), index 2 the serp, index 3 the query id and index 4 the
        comma-separated terms. Every later entry containing a comma is parsed
        as one hit; entries without a comma are ignored.
        """
        # Real lists (not lazy map objects) so terms and hits can be read
        # more than once.
        terms = [int(term) for term in fields[4].split(",")]
        hits = [
            tuple(int(part) for part in urldomain.split(","))
            for urldomain in fields[5:]
            if "," in urldomain
        ]
        return QueryEvent(
            int(fields[0]),
            (fields[1] == "T"),
            int(fields[2]),
            int(fields[3]),
            terms,
            hits,
        )

    def url_pertinence(self):
        """Return a defaultdict mapping url -> pertinence for the user.

        The pertinence of a url is the highest satisfaction grade among the
        clicks on it; urls that were never clicked read as 0. Described in:
        http://www.kaggle.com/c/yandex-personalized-web-search-challenge/details/evaluation
        """
        rates = defaultdict(int)
        for click in self.clicks:
            rates[click.url] = max(click.satisfaction(), rates[click.url])
        return rates
def parse(rows):
    """Yield one Session for each run of consecutive rows sharing an id.

    ``rows`` is an iterable of rows whose first element is the session id.
    Rows are grouped only while adjacent, so the rows of one session must
    arrive together. The id is converted to int and the group is
    materialised as a list before being passed to ``Session.from_rows``.
    """
    def session_id(row):
        return row[0]

    grouped = itertools.groupby(rows, key=session_id)
    for sid, session_rows in grouped:
        yield Session.from_rows(int(sid), list(session_rows))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment