Skip to content

Instantly share code, notes, and snippets.

@peterbe
Created May 2, 2018 16:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterbe/f147fd093aef43304a5c7e0a89c1ea0a to your computer and use it in GitHub Desktop.
Save peterbe/f147fd093aef43304a5c7e0a89c1ea0a to your computer and use it in GitHub Desktop.
import datetime
import gzip
import csv
from glob import glob
import concurrent.futures
import ciso8601
# Rows whose timestamp is strictly later than this moment are counted as
# "recent". The value is timezone-aware (UTC) and lies 6 * 30 = 180 days
# before the moment the module is loaded.
# datetime.now(tz) is used rather than utcnow().replace(tzinfo=...):
# it yields the same aware value and utcnow() is deprecated.
cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
    days=6 * 30
)
def count(fn, i):
    """Tally the rows of one gzip-compressed CSV file.

    Prints ``i`` and ``fn`` as a progress line, then reads the file as
    text and parses the fourth column (index 3) of every row with
    ``ciso8601.parse_datetime``.

    Args:
        fn: Path of the gzip-compressed CSV file to read.
        i: Index of the file; only used for the progress line.

    Returns:
        A ``(rows_seen, rows_recent)`` tuple: the number of rows read and
        the number of those whose timestamp is later than the
        module-level ``cutoff``.
    """
    print(i, fn)
    rows_seen = 0
    rows_recent = 0
    with gzip.open(fn, 'rt') as handle:
        for row in csv.reader(handle):
            modified_at = ciso8601.parse_datetime(row[3])
            if modified_at > cutoff:
                rows_recent += 1
            rows_seen += 1
    return rows_seen, rows_recent
def run():
    """Count rows across all matching ``*.csv.gz`` files in parallel.

    Every file in the current directory that matches ``*.csv.gz`` and
    whose name is exactly 39 characters long is handed to ``count`` in a
    process pool. The per-file results are summed, and three lines are
    printed: the total row count, the recent row count, and the recent
    share as a percentage with one decimal.

    When no rows were read at all (no matching file, or only empty
    files) the percentage is undefined, so ``n/a (no rows found)`` is
    printed instead of dividing by zero.
    """
    total = recent = 0
    # NOTE(review): the 39-character filter presumably selects files that
    # follow one fixed naming scheme -- confirm against the real data set.
    # Indices come from enumerating *all* glob hits, before filtering, so
    # the progress numbers printed by count() keep their original values.
    jobs = [
        (fn, i)
        for i, fn in enumerate(glob('*.csv.gz'))
        if len(fn) == 39
    ]
    if jobs:  # do not spin up a worker pool when there is nothing to do
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(count, fn, i) for fn, i in jobs]
            # Consume results in completion order; future.result()
            # re-raises any exception raised inside a worker.
            for future in concurrent.futures.as_completed(futures):
                t, c = future.result()
                total += t
                recent += c
    print(total)
    print(recent)
    if total:
        print('{:.1f}%'.format(100 * recent / total))
    else:
        print('n/a (no rows found)')
# Guard the entry point: with the "spawn" or "forkserver" start methods
# the ProcessPoolExecutor workers re-import this module, and an unguarded
# call would make every worker try to start the whole job again.
if __name__ == '__main__':
    run()
@peterbe
Copy link
Author

peterbe commented May 2, 2018

Using Python 3.6 and ciso8601==1.0.7
Output:

58819511
4233054
7.2%
python process-pool-ciso8601.py  617.11s user 3.54s system 686% cpu 1:30.39 total

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment