Skip to content

Instantly share code, notes, and snippets.

@k-bx
Last active December 10, 2015 07:18
Show Gist options
  • Save k-bx/4400242 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from collections import defaultdict
def get_row_key(row):
    """Return the grouping key for a row, built from columns 4 and 15.

    The two fields are joined with a comma so that distinct field pairs can
    never produce the same key (plain concatenation would map both
    ("ab", "c") and ("a", "bc") to "abc"). Rows in this file are produced by
    splitting a line on ',', so a field cannot itself contain a comma.

    The key is an opaque identifier: it is only meant to be compared with
    other keys produced by this function.
    """
    return str(row[4]) + "," + str(row[15])
def main():
    """Print every row of OUT_FILE whose key occurs more than Y times.

    The file is scanned in several passes: the stream of row keys is cut
    into groups of SPLIT_SIZE keys, and for each group the whole file is
    re-read to count how often the keys of that group occur (presumably so
    that only one group's counters are held in memory at a time). Keys
    counted more than Y times are collected, and a final pass gathers and
    prints all rows carrying one of those keys.
    """
    Y = 70  # a key qualifies only if it occurs strictly more than Y times
    SPLIT_SIZE = 1000000  # number of keys examined per counting pass
    OUT_FILE = "output.csv"

    def get_rows(f):
        """Yield each line of *f* split on ',' if columns 4 and 15 are set."""
        for line in f:
            if not isinstance(line, str):
                # On Python 3 a file opened "rb" yields bytes. latin-1 maps
                # every byte to exactly one character, so this cannot fail.
                line = line.decode("latin-1")
            row = line.rstrip('\n').rstrip('\r').split(',')
            # Blank or short lines have no column 15; skip them rather than
            # raising IndexError, like rows whose key fields are empty.
            if len(row) > 15 and row[4] and row[15]:
                yield row

    def get_keys(rows):
        """Yield the key of every row in *rows*."""
        for row in rows:
            yield get_row_key(row)

    def split(size, keys):
        """Yield consecutive lists of at most *size* items taken from *keys*.

        list(split(2, [1, 2, 3, 4, 5])) == [[1, 2], [3, 4], [5]]
        """
        group = []
        for key in keys:
            group.append(key)
            if len(group) == size:
                yield group
                group = []
        if group:
            yield group

    def get_y_keys_for_group(group, keys):
        """Return the members of *group* seen more than Y times in *keys*."""
        counts = defaultdict(int)
        for key in keys:
            if key in group:
                counts[key] += 1
        return [key for key, count in counts.items() if count > Y]

    def get_rows_for_keys(keys, rows):
        """Return the rows from *rows* whose key is contained in *keys*."""
        return [row for row in rows if get_row_key(row) in keys]

    def print_result(result):
        """Print the number of rows in *result*, then one row per line."""
        print('Overall result len is: %d' % len(result))
        print("\n".join(str(x) for x in result))

    # A set, because the same key can show up in more than one group.
    y_keys = set()
    with open(OUT_FILE, "rb") as f:
        for group in split(SPLIT_SIZE, get_keys(get_rows(f))):
            # Keys already known to exceed Y need not be counted again.
            group_set = set(group) - y_keys
            if not group_set:
                continue
            # Count over the whole file through a second handle so the
            # position of *f*, which feeds the groups, is left untouched.
            with open(OUT_FILE, "rb") as f1:
                y_keys.update(
                    get_y_keys_for_group(group_set, get_keys(get_rows(f1))))
        # All groups are consumed; rewind for the final collecting pass.
        f.seek(0)
        result = get_rows_for_keys(y_keys, get_rows(f))
    print_result(result)
# Run the analysis only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment