# Last active
# December 10, 2015 07:18
# -
# -
# Save k-bx/4400242 to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
from collections import defaultdict
def get_row_key(row):
    """Return the grouping key of a row.

    The key is ``str(row[4] + row[15])``, i.e. the 5th and 16th fields
    joined without a separator.

    NOTE(review): because no separator is used, distinct string pairs such
    as ("ab", "c") and ("a", "bc") produce the same key. Confirm that the
    two columns cannot collide this way in the real data.

    :param row: indexable sequence with at least 16 fields.
    :returns: the key as a ``str``.
    :raises IndexError: if the row has fewer than 16 fields.
    """
    return str(row[4] + row[15])
def main():
    """Print every row of ``output.csv`` whose key occurs more than Y times.

    The file is scanned in several passes to bound memory use: keys are
    read in chunks of SPLIT_SIZE, and for each chunk the whole file is
    re-read to count how often the keys of that chunk occur. Keys seen
    more than Y times are collected, and a final pass gathers the rows
    carrying one of those keys. The number of matching rows is printed,
    followed by one line per row.
    """
    Y = 70                    # a key must occur MORE than this many times
    SPLIT_SIZE = 1000000      # number of keys examined per counting pass
    OUT_FILE = "output.csv"   # file that is read (despite the name)

    def get_rows(f):
        """Yield the comma-split rows of *f* that have both key columns set."""
        for line in f:
            row = line.rstrip('\n').rstrip('\r').split(',')
            # Lines with fewer than 16 fields (e.g. a blank trailing line)
            # cannot carry a key; skip them instead of raising IndexError.
            if len(row) > 15 and row[4] and row[15]:
                yield row

    def get_keys(rows):
        """Yield the key of every row in *rows*."""
        for row in rows:
            yield get_row_key(row)

    def split(size, keys):
        """Yield consecutive lists of at most *size* items taken from *keys*.

        list(split(2, [1, 2, 3, 4, 5])) == [[1, 2], [3, 4], [5]]
        """
        group = []
        for key in keys:
            group.append(key)
            if len(group) == size:
                yield group
                group = []
        if group:
            yield group

    def get_y_keys_for_group(group, keys):
        """Return the members of *group* that occur more than Y times in *keys*."""
        counts = defaultdict(int)
        for key in keys:
            if key in group:
                counts[key] += 1
        # items() works on both Python 2 and 3 (iteritems() is 2-only).
        return [key for key, count in counts.items() if count > Y]

    def get_rows_for_keys(keys, rows):
        """Return the rows from *rows* whose key is contained in *keys*."""
        return [row for row in rows if get_row_key(row) in keys]

    def print_result(result):
        """Print the number of rows in *result*, then each row on its own line."""
        # Single-argument print(...) yields the same output on Python 2 and 3.
        print('Overall result len is: %d' % len(result))
        print("\n".join([str(x) for x in result]))

    # Text mode so that lines are str on Python 3 as well as on Python 2.
    with open(OUT_FILE, "r") as f:
        y_keys = set()
        for group in split(SPLIT_SIZE, get_keys(get_rows(f))):
            group_set = set(group)
            # Separate handle: the outer handle is still being consumed
            # lazily by the chunking generator.
            with open(OUT_FILE, "r") as f1:
                y_keys.update(
                    get_y_keys_for_group(group_set, get_keys(get_rows(f1))))
        # The chunking pass exhausted f; rewind it for the final pass.
        f.seek(0)
        result = get_rows_for_keys(y_keys, get_rows(f))
    print_result(result)
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment