Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import datetime
import gzip
import csv
from glob import glob
import concurrent.futures
import ciso8601
# Timezone-aware UTC instant 180 days (6 * 30, roughly six months) before
# the script started; rows last modified after this are counted as recent.
# datetime.now(tz) yields an aware value directly, avoiding the deprecated
# naive datetime.utcnow().
cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
    days=6 * 30
)
def count(fn, i):
    """Count the rows of one gzipped CSV file and how many are recent.

    A row is "recent" when the timestamp in its fourth column (index 3),
    parsed with ``ciso8601.parse_datetime``, is later than the module-level
    ``cutoff``.

    Args:
        fn: Path of the gzip-compressed CSV file to read.
        i: Index of the file; only echoed in the progress line.

    Returns:
        A ``(total, recent)`` tuple: the number of rows read and the number
        of those rows that are newer than ``cutoff``.

    Raises:
        IndexError: If a row has fewer than four columns.
    """
    # Progress line so the caller can see which file a worker picked up.
    print(i, fn)
    total = recent = 0
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # parsed correctly.
    with gzip.open(fn, 'rt', newline='') as f:
        for row in csv.reader(f):
            # NOTE(review): cutoff is timezone-aware, so the parsed value
            # presumably must carry a UTC offset too - confirm the data does.
            lastmodified = ciso8601.parse_datetime(row[3])
            if lastmodified > cutoff:
                recent += 1
            total += 1
    return total, recent
def run():
    """Tally rows across the matching ``*.csv.gz`` files and print a summary.

    Every selected file in the current directory is handed to ``count`` in a
    worker process. Three lines are printed: the total number of rows, the
    number of recent rows, and the recent share as a percentage with four
    decimals. When no rows were read, the percentage line is replaced by a
    short notice instead of dividing by zero.
    """
    # NOTE(review): only file names exactly 39 characters long are processed;
    # presumably this selects one specific naming pattern - confirm.
    # The index is taken over all glob results, matching the progress numbers
    # that count() prints.
    jobs = [
        (i, fn)
        for i, fn in enumerate(glob('*.csv.gz'))
        if len(fn) == 39
    ]

    total = recent = 0
    # Skip starting a process pool when there is nothing to do.
    if jobs:
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(count, fn, i) for i, fn in jobs]
            # Fold results in as workers finish; order does not matter for
            # sums. result() re-raises any exception from the worker.
            for future in concurrent.futures.as_completed(futures):
                t, c = future.result()
                total += t
                recent += c

    print(total)
    print(recent)
    if total:
        print('{:.4f}%'.format(100 * recent / total))
    else:
        # No files matched, or every matching file was empty.
        print('no rows found')
# Only start the work when executed as a script. Worker processes started
# with the "spawn" method re-import this module, and an unguarded call would
# make each of them try to launch the whole job again.
if __name__ == '__main__':
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.