Skip to content

Instantly share code, notes, and snippets.

@snopoke
Last active December 17, 2019 14:54
Show Gist options
  • Save snopoke/85af6f30e114ea1f8d8d410c33dc59f1 to your computer and use it in GitHub Desktop.
Save snopoke/85af6f30e114ea1f8d8d410c33dc59f1 to your computer and use it in GitHub Desktop.
import os
import csv
import shutil
import gzip
from datetime import datetime, timedelta
from collections import Counter
from elasticsearch.exceptions import ConnectionTimeout
from corehq.util.timezones.utils import parse_date
from corehq.apps.es import FormES
# Measures how long forms take to arrive: for each form in the 'icds-cas'
# domain returned by the `completed` filter for the window [start, end), compute
# the number of days between the date in form.meta.timeEnd and the date in
# received_on, and write one "<day>_summary.csv" histogram per day into `path`.
# When `dump_raw` is true the individual rows are also appended to a gzipped
# CSV per day.
path = '/home/cchq/form_dates'

# Every run starts from an empty output directory.
try:
    shutil.rmtree(path)
except FileNotFoundError:
    pass
os.makedirs(path)


def _write_summary(summary_day, counts):
    """Write the days_diff -> form_count histogram for one day.

    ``summary_day`` is the date the histogram belongs to and ``counts`` is the
    Counter keyed by days_diff. The file is written to
    ``<path>/<YYYY-MM-DD>_summary.csv`` with rows sorted by days_diff.
    """
    summary_filename = f"{summary_day.strftime('%Y-%m-%d')}_summary.csv"
    print(f' Writing summary {summary_filename}')
    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(path, summary_filename), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['days_diff', 'form_count'])
        writer.writerows(sorted(counts.items()))


gstart = datetime.utcnow()  # wall-clock start, used only for the runtime report
start = datetime(2019, 10, 1)
end = datetime(2019, 11, 1)
tc = 0  # total number of forms seen so far
day_summary = {}  # date -> Counter of days_diff for days not yet written out
dump_raw = False

# Walk the window one hour at a time; `start` is the beginning of the current
# batch and `se` its exclusive end.
while start < end:
    day = start.date()

    if dump_raw:
        all_data_filename = f"{start.strftime('%Y-%m-%d')}.csv.gz"
        # Headers are only emitted by the batch that creates the day's file.
        all_data_headers = []
        if not os.path.isfile(os.path.join(path, all_data_filename)):
            all_data_headers = ['form_id', 'completed_on', 'received_on', 'days_diff']

    if day not in day_summary:
        # First batch of a new day: open its counter and flush the day before.
        day_summary[day] = Counter()
        prev_day = day - timedelta(days=1)
        prev_day_summary = day_summary.pop(prev_day, None)
        if prev_day_summary:
            _write_summary(prev_day, prev_day_summary)

    count = 0
    se = start + timedelta(hours=1)
    rows = []
    try:
        forms = (
            FormES()
            .domain('icds-cas')
            .completed(gte=start, lt=se)
            .source(['form.meta.timeEnd', 'received_on'])
            .scroll()
        )
        for form in forms:
            count += 1
            completed_on = form['form']['meta']['timeEnd']
            received_on = form['received_on']
            days_diff = parse_date(received_on).date() - parse_date(completed_on).date()
            rows.append([form['_id'], completed_on, received_on, days_diff.days])
    except ConnectionTimeout:
        # `start` is not advanced, so the same hour is queried again and the
        # partial `rows`/`count` from the failed attempt are discarded.
        # NOTE(review): this retries indefinitely if the cluster stays down.
        print(f' retrying batch: {start} to {se}')
        continue

    if dump_raw:
        with gzip.open(os.path.join(path, all_data_filename), 'at', newline='') as f:
            writer = csv.writer(f)
            if all_data_headers:
                writer.writerow(all_data_headers)
            writer.writerows(rows)

    day_summary[day].update(row[3] for row in rows)
    tc += count
    print(f'[{datetime.utcnow()}] Runtime: {datetime.utcnow() - gstart}, Progress: {start} to {se}: {count} ({tc})')
    start = se

# The loop only writes a day's summary when the following day begins, so the
# final day in the window is still pending here; write whatever is left.
for remaining_day in sorted(day_summary):
    remaining_summary = day_summary.pop(remaining_day)
    if remaining_summary:
        _write_summary(remaining_day, remaining_summary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment