Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@peterbe
Created May 15, 2018 19:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterbe/e0a60822c7fcd6ac3c374ae3fc3ab473 to your computer and use it in GitHub Desktop.
Save peterbe/e0a60822c7fcd6ac3c374ae3fc3ab473 to your computer and use it in GitHub Desktop.
import datetime
import gzip
import csv
from glob import glob
import concurrent.futures
# Rows last modified after this instant are considered "recent" (6 * 30 days
# back from now). The timestamps parsed in count() end in a literal 'Z'
# (UTC), so the cutoff is taken from the current UTC time rather than local
# time, then made naive so it compares against the naive datetimes that
# strptime() returns.
cutoff = (
    datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    - datetime.timedelta(days=6 * 30)
)


def count(fn, i):
    """Count the rows of one gzipped CSV file and how many are recent.

    The file is opened in text mode and parsed with ``csv.reader``. The
    fourth column (index 3) of every row is parsed as a timestamp of the
    form ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` and compared against the
    module-level ``cutoff``.

    Args:
        fn: Path of the gzip-compressed CSV file to read.
        i: Sequence number of the file; only used for the progress print.

    Returns:
        A ``(total, recent)`` tuple: the number of non-blank rows, and the
        number of those whose timestamp is later than ``cutoff``.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
        ValueError: If the fourth column does not match the timestamp format.
    """
    print(i, fn)
    total = recent = 0
    with gzip.open(fn, 'rt') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields an empty list for a blank line; there is
                # no timestamp to inspect, so skip it instead of failing.
                continue
            lastmodified = datetime.datetime.strptime(
                row[3],
                '%Y-%m-%dT%H:%M:%S.%fZ'
            )
            if lastmodified > cutoff:
                recent += 1
            total += 1
    return total, recent
def run():
    """Tally all matching ``*.csv.gz`` files in the current directory.

    Every file matched by ``glob('*.csv.gz')`` whose name is exactly 39
    characters long is handed to ``count`` in a worker process. The per-file
    results are summed and three lines are printed: the total row count, the
    recent row count, and the recent share as a percentage with one decimal.
    """
    # NOTE(review): the 39-character filter presumably selects one specific
    # file-naming scheme -- confirm against the actual dump file names.
    # The index is taken over all glob results, not just the matching ones.
    jobs = [
        (i, fn)
        for i, fn in enumerate(glob('*.csv.gz'))
        if len(fn) == 39
    ]

    total = recent = 0
    if jobs:
        # Only start a process pool when there is something to submit.
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(count, fn, i) for i, fn in jobs]
            for future in concurrent.futures.as_completed(futures):
                t, c = future.result()
                total += t
                recent += c

    print(total)
    print(recent)
    # Guard against division by zero when no rows were counted at all.
    percentage = 100 * recent / total if total else 0.0
    print('{:.1f}%'.format(percentage))
# Guard the entry point: ProcessPoolExecutor workers started with the "spawn"
# method re-import this module, and an unguarded call would run again in
# every worker.
if __name__ == '__main__':
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment