Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Categorize CommCare HQ domains needing couch -> sql migration of forms and cases
#!/usr/bin/env python
# coding: utf-8
"""Categorize CommCare HQ domains needing couch -> sql migration of forms and cases

This script requires a domains.csv file, which can be obtained by following
these steps:

1. Download Excel file containing domains using CouchDB for forms and cases
2. Open the file and remove the (last three) summary rows.
3. Export to CSV format.

The output is a set of text files listing domains in each category.
Output filenames are formatted as "{ORIGIN}_{category}.txt"

This script lives at <URL lost in scrape -- TODO restore>

Usage: categorize.py [options] DOMAINS_CSV ORIGIN

Options:
    -h --help          Show this help.
    --output-dir=PATH  Output directory [default: .]
    --plot             Show a pie graph of domain categories.

Requires extra setup:

    python3 -m venv /path/to/virtualenv
    # activate newly created virtualenv
    pip install docopt pandas matplotlib
"""
import sys
from os.path import join

try:
    import pandas as pd
    from docopt import docopt
except ImportError:
    # Third-party dependencies are imported lazily so a missing package
    # produces a setup hint instead of a bare traceback.
    print("Pre-setup required:\npip install docopt pandas")
    sys.exit(1)
def main():
    """Categorize domains from DOMAINS_CSV and write one text file per category.

    Prints a summary table (category, domain count, form count, estimated
    migration time) and, with --plot, shows a pie chart of the categories.
    """
    args = docopt(__doc__)
    data = pd.read_csv(args["DOMAINS_CSV"])
    origin = args["ORIGIN"]
    output_dir = args["--output-dir"]
    categories = {}
    all_domains = len(data)
    total_domains = 0
    # Header row for the summary table printed inside the loop below.
    print("{:<15} {:>6} {:>12} {}".format(
        "category", "count", "forms", "migration time"))
    # Order matters: after each category is extracted, left_xor removes its
    # rows from `data`, so each later query only sees unclaimed domains and
    # the final "moderate" category is a catch-all for whatever remains.
    for name, query in [
        ("weird", get_weird),  # mistakes?
        ("small", get_small(2000)),
        ("smallish", get_small(3500)),
        ("smallesque", get_small(5000)),
        ("fossilized", get_old(years=5)),
        ("inactive", get_old(years=1, limit=10000)),
        ("inactive_large", get_old(years=1, limit=100000)),
        ("super_large", get_large(720000)),
        ("large", get_large(60000)),
        ("moderate", lambda d: d),
    ]:
        categories[name] = query(data)
        data = left_xor(data, categories[name])
        if name == "weird":
            # "weird" rows have a non-numeric form count, so no estimate.
            forms = time_to_complete = "unknown"
        else:
            forms = categories[name]['# Form Submissions'].apply(pd.to_numeric).sum()
            time_to_complete = get_time_to_complete(forms)
        total_domains += len(categories[name])
        print("{:<15} {:>6} {:>12} {}".format(
            name, len(categories[name]), forms, time_to_complete))
        path = join(output_dir, "%s_%s.txt" % (origin, name))
        with open(path, "w", encoding="utf-8") as fh:
            # NOTE(review): the scrape lost the line that wrote domain names.
            # Assuming the domain identifier is the CSV's first column --
            # confirm against the exported spreadsheet's header row.
            fh.write("\n".join(categories[name].iloc[:, 0].astype(str)))
    # Every domain must land in exactly one category.
    assert total_domains == all_domains, (total_domains, all_domains)
    print("total {:>16}".format(total_domains))
    if args["--plot"]:
        show_pie_graph(categories)
def get_weird(data):
    """Return the rows whose active-worker count was never computed.

    These carry the literal string "Not yet calculated" instead of a
    number, so numeric queries cannot handle them.
    """
    uncalculated = data["# Active Mobile Workers"] == "Not yet calculated"
    return data[uncalculated]
def get_small(upper_limit):
    """Build a query keeping domains with fewer than ``upper_limit`` forms."""
    def query(data):
        submissions = data['# Form Submissions'].apply(pd.to_numeric)
        return data[submissions < upper_limit]
    return query
def get_large(lower_limit):
    """Build a query keeping domains with more than ``lower_limit`` forms."""
    def query(data):
        submissions = data['# Form Submissions'].apply(pd.to_numeric)
        return data[submissions > lower_limit]
    return query
def get_old(*, years, limit=None):
    """Build a query keeping domains whose last form submission is at least
    ``years`` years before NOW, optionally also requiring fewer than
    ``limit`` form submissions.
    """
    try:
        years_ago = NOW.replace(year=NOW.year - years)
    except ValueError:
        # NOW is Feb 29 and the target year is not a leap year; fall back
        # to Feb 28 rather than crashing once every four years.
        years_ago = NOW.replace(year=NOW.year - years, day=28)

    def query(data):
        where = data['Last Form Submission'].apply(pd.to_datetime) <= years_ago
        if limit is not None:
            where &= data['# Form Submissions'].apply(pd.to_numeric) < limit
        return data[where]
    return query
def left_xor(left, right):
    """Return the rows of ``left`` that are not present in ``right``.

    Performs an outer merge with an indicator column, keeps the rows that
    came only from the left side, and drops the indicator before returning.
    """
    merged = left.merge(right, indicator=True, how='outer')
    merged = merged[merged['_merge'] == 'left_only']
    # drop(columns=...) -- the positional `axis` argument used previously
    # (drop('_merge', 1)) was deprecated in pandas 1.0 and removed in 2.0.
    return merged.drop(columns='_merge')
def get_time_to_complete(forms):
    """Return a rough, human-readable migration time estimate for ``forms`` forms."""
    # Estimated migration throughput is about 24 forms/sec in the first
    # phase. Other phases add extra time. Estimate low here. And always
    # add more buffer time if/when sharing these numbers.
    num = forms / 10  # 10 forms/sec overall
    for divisor, name, next_div in [
        (60, "minutes", 60),
        (60, "hours", 24),
        (24, "days", 7),
        (7, "weeks", 4),
    ]:
        num = num / divisor
        # Report in this unit unless the value would exceed two of the
        # next-larger unit (e.g. prefer "36 hours" over "1.5 days").
        if num < next_div * 2:
            return "%.1f %s" % (num, name)
    # Terminal fallback: very long estimates are reported in weeks rather
    # than falling off the loop and returning None.
    return "%.1f %s" % (num, "weeks")
def show_pie_graph(categories):
    """Display a pie chart of category sizes (one wedge per category)."""
    # Local import: matplotlib is an optional extra (see module docstring).
    import matplotlib.pyplot as plt
    sizes = []
    labels = []
    for name, data in categories.items():
        # NOTE(review): the scrape lost the wedge-size line; sizes was
        # built empty. Wedge size = number of domains in the category.
        sizes.append(len(data))
        labels.append(name + " (%d)" % len(data))
    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, startangle=90)
    # Equal aspect ratio so the pie is drawn as a circle.
    ax1.axis("equal")
    plt.show()
if __name__ == "__main__":
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.