Created
July 13, 2014 10:09
-
-
Save huseyinyilmaz/5c634169935bc92638d4 to your computer and use it in GitHub Desktop.
most_common subsequence search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
from collections import Counter | |
from itertools import izip | |
from itertools import islice | |
from operator import itemgetter | |
import datetime | |
import random | |
event_length = 1000000 | |
WIN_SIZE = 3 | |
def get_events(): | |
event_types = range(20) | |
return [random.choice(event_types) for i in range(event_length)] | |
def original(events, most_common): | |
counter = defaultdict(int) | |
for index in range(len(events) - WIN_SIZE + 1): | |
window = tuple(events[index: index + WIN_SIZE]) | |
counter[window] += 1 | |
return sorted(counter.items(), | |
key=itemgetter(1), | |
reverse=True)[:most_common] | |
def seq(events, most_common): | |
counter = Counter( | |
izip(*[islice(events, i, None) for i in range(WIN_SIZE)])) | |
return counter.most_common(most_common) | |
def start(): | |
most_common = 4 | |
events = get_events() | |
dt1 = datetime.datetime.now() | |
result = seq(events, most_common) | |
dt2 = datetime.datetime.now() | |
seq_time = dt2 - dt1 | |
print result | |
print 'result = %s' % result | |
print 'seq_time = %s' % seq_time | |
dt1 = datetime.datetime.now() | |
result = original(events, most_common) | |
dt2 = datetime.datetime.now() | |
org_time = dt2 - dt1 | |
print 'original_time = %s' % org_time | |
print 'result = %s' % result | |
if __name__ == '__main__': | |
start() | |
# [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)] | |
# result = [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)] | |
# seq_time = 0:00:00.604561 | |
# original_time = 0:00:00.771838 | |
# result = [((8, 11, 19), 171), ((12, 7, 16), 165), ((1, 5, 15), 164), ((18, 6, 9), 164)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Line # Mem usage Increment Line Contents | |
================================================ | |
35 8.047 MiB 0.000 MiB @profile | |
36 def start(): | |
37 8.070 MiB 0.023 MiB most_common = 4 | |
38 46.812 MiB 38.742 MiB events = get_events() | |
39 | |
40 46.812 MiB 0.000 MiB dt1 = datetime.datetime.now() | |
41 47.617 MiB 0.805 MiB result = seq(events, most_common) | |
42 47.625 MiB 0.008 MiB dt2 = datetime.datetime.now() | |
43 47.633 MiB 0.008 MiB seq_time = dt2 - dt1 | |
44 47.633 MiB 0.000 MiB print result | |
45 47.633 MiB 0.000 MiB print 'result = %s' % result | |
46 47.633 MiB 0.000 MiB print 'seq_time = %s' % seq_time | |
47 | |
48 47.633 MiB 0.000 MiB dt1 = datetime.datetime.now() | |
49 48.477 MiB 0.844 MiB result = original(events, most_common) | |
50 48.484 MiB 0.008 MiB dt2 = datetime.datetime.now() | |
51 48.484 MiB 0.000 MiB org_time = dt2 - dt1 | |
52 48.484 MiB 0.000 MiB print 'original_time = %s' % org_time | |
53 48.484 MiB 0.000 MiB print 'result = %s' % result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment