Skip to content

Instantly share code, notes, and snippets.

@huseyinyilmaz
Created July 13, 2014 10:09
Show Gist options
  • Save huseyinyilmaz/5c634169935bc92638d4 to your computer and use it in GitHub Desktop.
Save huseyinyilmaz/5c634169935bc92638d4 to your computer and use it in GitHub Desktop.
most_common subsequence search
from collections import defaultdict
from collections import Counter
from itertools import izip
from itertools import islice
from operator import itemgetter
import datetime
import random
event_length = 1000000
WIN_SIZE = 3
def get_events():
event_types = range(20)
return [random.choice(event_types) for i in range(event_length)]
def original(events, most_common):
counter = defaultdict(int)
for index in range(len(events) - WIN_SIZE + 1):
window = tuple(events[index: index + WIN_SIZE])
counter[window] += 1
return sorted(counter.items(),
key=itemgetter(1),
reverse=True)[:most_common]
def seq(events, most_common):
counter = Counter(
izip(*[islice(events, i, None) for i in range(WIN_SIZE)]))
return counter.most_common(most_common)
def start():
most_common = 4
events = get_events()
dt1 = datetime.datetime.now()
result = seq(events, most_common)
dt2 = datetime.datetime.now()
seq_time = dt2 - dt1
print result
print 'result = %s' % result
print 'seq_time = %s' % seq_time
dt1 = datetime.datetime.now()
result = original(events, most_common)
dt2 = datetime.datetime.now()
org_time = dt2 - dt1
print 'original_time = %s' % org_time
print 'result = %s' % result
if __name__ == '__main__':
start()
# [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)]
# result = [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)]
# seq_time = 0:00:00.604561
# original_time = 0:00:00.771838
# result = [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)]
Line # Mem usage Increment Line Contents
================================================
35 8.047 MiB 0.000 MiB @profile
36 def start():
37 8.070 MiB 0.023 MiB most_common = 4
38 46.812 MiB 38.742 MiB events = get_events()
39
40 46.812 MiB 0.000 MiB dt1 = datetime.datetime.now()
41 47.617 MiB 0.805 MiB result = seq(events, most_common)
42 47.625 MiB 0.008 MiB dt2 = datetime.datetime.now()
43 47.633 MiB 0.008 MiB seq_time = dt2 - dt1
44 47.633 MiB 0.000 MiB print result
45 47.633 MiB 0.000 MiB print 'result = %s' % result
46 47.633 MiB 0.000 MiB print 'seq_time = %s' % seq_time
47
48 47.633 MiB 0.000 MiB dt1 = datetime.datetime.now()
49 48.477 MiB 0.844 MiB result = original(events, most_common)
50 48.484 MiB 0.008 MiB dt2 = datetime.datetime.now()
51 48.484 MiB 0.000 MiB org_time = dt2 - dt1
52 48.484 MiB 0.000 MiB print 'original_time = %s' % org_time
53 48.484 MiB 0.000 MiB print 'result = %s' % result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment