import datetime | |
import gzip | |
import csv | |
from glob import glob | |
import concurrent.futures | |
# Rows whose timestamp is later than this instant are counted as "recent":
# 6 * 30 days (about six months) before the script started.
# The timestamps parsed from the CSV files end in a literal "Z" (the ISO 8601
# designator for UTC) and are parsed into naive datetimes, so the cutoff is
# taken in UTC and stripped of its tzinfo to stay directly comparable.
cutoff = (
    datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    - datetime.timedelta(days=6 * 30)
)
def count(fn, i, since=None):
    """Count the rows of one gzipped CSV file and how many of them are recent.

    Column index 3 of every row is parsed as a timestamp of the form
    ``YYYY-MM-DDTHH:MM:SS.<fraction>Z``. A row is "recent" when that
    timestamp is strictly later than the threshold.

    Args:
        fn: Path of the gzip-compressed CSV file to read.
        i: Sequence number of the file; only echoed in the progress line.
        since: Naive ``datetime`` threshold. When omitted, the module-level
            ``cutoff`` is used.

    Returns:
        A ``(total, recent)`` tuple: the number of non-empty rows read and
        the number of those whose timestamp is later than the threshold.

    Raises:
        IndexError: If a non-empty row has fewer than four columns.
        ValueError: If the timestamp column does not match the expected
            format.
    """
    if since is None:
        since = cutoff
    print(i, fn)
    total = recent = 0
    # newline='' hands newline handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # read as a single row.
    with gzip.open(fn, 'rt', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; they are not records.
                continue
            # The trailing "Z" is matched literally, so the result is a
            # naive datetime.
            lastmodified = datetime.datetime.strptime(
                row[3],
                '%Y-%m-%dT%H:%M:%S.%fZ'
            )
            if lastmodified > since:
                recent += 1
            total += 1
    return total, recent
def run():
    """Tally rows across the ``*.csv.gz`` files in the current directory.

    Every matching file whose name is exactly 39 characters long is handed
    to ``count`` on a thread pool. The per-file ``(total, recent)`` results
    are summed and three lines are printed: the overall row count, the
    recent row count, and the recent share as a percentage with one decimal.

    Raises:
        Any exception raised by ``count`` for a file is re-raised here when
        that file's result is collected.
    """
    total = recent = 0
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # NOTE(review): the 39-character filter presumably selects one
        # specific file-naming scheme -- confirm against the real file names.
        futures = [
            executor.submit(count, fn, i)
            for i, fn in enumerate(glob('*.csv.gz'))
            if len(fn) == 39
        ]
        for future in concurrent.futures.as_completed(futures):
            t, c = future.result()
            total += t
            recent += c
    print(total)
    print(recent)
    # With no matching files (or only empty ones) total is 0; report 0.0%
    # instead of failing with ZeroDivisionError.
    percentage = 100 * recent / total if total else 0.0
    print('{:.1f}%'.format(percentage))
# Only start the tally when executed as a script, so that importing this
# module does not scan the current directory as a side effect.
if __name__ == "__main__":
    run()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.