tgherzog/build.py

## build.py
import wbgapi as wb
import pandas as pd
import json

# Background: this is an attempt to make part of the data API more consistent and coherent.
# The API back-end has a mapping table that specifies the canonical database for each
# indicator when indicators appear in more than one database and the user doesn't explicitly
# specify a database, e.g.:
#   https://api.worldbank.org/v2/indicator/SP.POP.TOTL           (database is ambiguous)
#   https://api.worldbank.org/v2/indicator/SP.POP.TOTL?source=16 (database is clear)
# The problem with the existing approach is that 1) it is not transparent; 2) it is vulnerable
# to inconsistent rules, and 3) things behave unexpectedly as indicators are added, removed or
# moved to WDI archives.
# An improved approach implemented circa June, 2021 is to effectively implement a database hierarchy
# so that responses would be consistent when the database is ambiguous. We requested that ITS
# implement a control file with the database hierarchy so that DECDG could reference and update
# it as necessary. I think in reality, they use the control file to update the original mapping
# table rather than reference it directly in code.

# This script shows how the current table is generated. The default hierarchy is simply the order
# in which databases (sources) are reported by the API, which happens to be sorted by database ID.
# Databases 2, 63, and 16 (WDI, Human Capital and HNP) are then bumped up in priority, and
# Africa Development Indicators (sunsetted) and WDI Archives are moved to the bottom.

# Because neither the code nor the resulting table are directly referenced by the API (they
# are used to regenerate an internal table that we cannot directly access), you still need
# to do some QA after changing the database hierarchy.

# Build the database-ranking table and print it as a JSON array of
# {"sourceId": ..., "ranking": ...} objects (both values emitted as strings).
#
# Default rank is the numeric source ID times 10, which leaves room between
# neighbours. Three sources are then pinned to ranks 1-3 and two are pushed
# below every other source.

# Query the source list once and reuse it; each call goes to the API.
sources = wb.source.Series()
df = pd.DataFrame({'name': sources})

# Index labels are source IDs as strings; cast to int so the default rank
# follows numeric rather than lexical ID order.
df['ranking'] = df.index.astype('int64') * 10

# Highest-priority sources (2, 63, 16), in that order.
df.loc['2', 'ranking'] = 1
df.loc['63', 'ranking'] = 2
df.loc['16', 'ranking'] = 3

# Lowest-priority sources (11, 57): at least 1000, and always past the
# largest default rank so they stay last as new sources are added.
bottom = max(df['ranking'].max()+1, 1000)
df.loc['11', 'ranking'] = bottom
df.loc['57', 'ranking'] = bottom + 1

# Only the ranking column is needed, so iterate it directly instead of
# building a Series per row with iterrows(). int() keeps the output an integer
# string even if the column has been upcast to float (which may happen if one
# of the .loc assignments above targets an ID missing from the API listing).
rankings = [
    {'sourceId': str(source_id), 'ranking': str(int(rank))}
    for source_id, rank in df['ranking'].items()
]
print(json.dumps(rankings))

## rankinghierarchy.json
[{"sourceId": "1", "ranking": "10"}, {"sourceId": "2", "ranking": "1"}, {"sourceId": "3", "ranking": "30"}, {"sourceId": "5", "ranking": "50"}, {"sourceId": "6", "ranking": "60"}, {"sourceId": "11", "ranking": "1000"}, {"sourceId": "12", "ranking": "120"}, {"sourceId": "13", "ranking": "130"}, {"sourceId": "14", "ranking": "140"}, {"sourceId": "15", "ranking": "150"}, {"sourceId": "16", "ranking": "3"}, {"sourceId": "18", "ranking": "180"}, {"sourceId": "19", "ranking": "190"}, {"sourceId": "20", "ranking": "200"}, {"sourceId": "22", "ranking": "220"}, {"sourceId": "23", "ranking": "230"}, {"sourceId": "24", "ranking": "240"}, {"sourceId": "25", "ranking": "250"}, {"sourceId": "27", "ranking": "270"}, {"sourceId": "28", "ranking": "280"}, {"sourceId": "29", "ranking": "290"}, {"sourceId": "30", "ranking": "300"}, {"sourceId": "31", "ranking": "310"}, {"sourceId": "32", "ranking": "320"}, {"sourceId": "33", "ranking": "330"}, {"sourceId": "34", "ranking": "340"}, {"sourceId": "35", "ranking": "350"}, {"sourceId": "36", "ranking": "360"}, {"sourceId": "37", "ranking": "370"}, {"sourceId": "38", "ranking": "380"}, {"sourceId": "39", "ranking": "390"}, {"sourceId": "40", "ranking": "400"}, {"sourceId": "41", "ranking": "410"}, {"sourceId": "43", "ranking": "430"}, {"sourceId": "45", "ranking": "450"}, {"sourceId": "46", "ranking": "460"}, {"sourceId": "50", "ranking": "500"}, {"sourceId": "54", "ranking": "540"}, {"sourceId": "57", "ranking": "1001"}, {"sourceId": "58", "ranking": "580"}, {"sourceId": "59", "ranking": "590"}, {"sourceId": "60", "ranking": "600"}, {"sourceId": "61", "ranking": "610"}, {"sourceId": "62", "ranking": "620"}, {"sourceId": "63", "ranking": "2"}, {"sourceId": "64", "ranking": "640"}, {"sourceId": "65", "ranking": "650"}, {"sourceId": "66", "ranking": "660"}, {"sourceId": "67", "ranking": "670"}, {"sourceId": "68", "ranking": "680"}, {"sourceId": "69", "ranking": "690"}, {"sourceId": "70", "ranking": "700"}, {"sourceId": "71", "ranking": 
"710"}, {"sourceId": "72", "ranking": "720"}, {"sourceId": "73", "ranking": "730"}, {"sourceId": "75", "ranking": "750"}, {"sourceId": "76", "ranking": "760"}, {"sourceId": "77", "ranking": "770"}, {"sourceId": "78", "ranking": "780"}, {"sourceId": "79", "ranking": "790"}, {"sourceId": "80", "ranking": "800"}, {"sourceId": "81", "ranking": "810"}, {"sourceId": "82", "ranking": "820"}, {"sourceId": "83", "ranking": "830"}, {"sourceId": "84", "ranking": "840"}]
	import wbgapi as wb
	import pandas as pd
	import json

	# Background: this is an attempt to make part of the data API more consistent and coherent.
	# The API back-end has a mapping table that specifies the canonical database for each
	# indicator when indicators appear in more than one database and the user doesn't explicitly
	# specify a database, e.g.:
	# https://api.worldbank.org/v2/indicator/SP.POP.TOTL (database is ambiguous)
	# https://api.worldbank.org/v2/indicator/SP.POP.TOTL?source=16 (database is clear)
	# The problem with the existing approach is that 1) it is not transparent; 2) it is vulnerable
	# to inconsistent rules, and 3) things behave unexpectedly as indicators are added, removed or
	# moved to WDI archives.
	# An improved approach implemented circa June, 2021 is to effectively implement a database hierarchy
	# so that responses would be consistent when the database is ambiguous. We requested that ITS
	# implement a control file with the database hierarchy so that DECDG could reference and update
	# it as necessary. I think in reality, they use the control file to update the original mapping
	# table rather than reference it directly in code.

	# This script shows how the current table is generated. The default hierarchy is simply the order
	# in which databases (sources) are reported by the API, which happens to be sorted by database ID.
	# Databases 2, 63, and 16 (WDI, Human Capital and HNP) are then bumped up in priority, and
	# Africa Development Indicators (sunsetted) and WDI Archives are moved to the bottom.

	# Because neither the code nor the resulting table are directly referenced by the API (they
	# are used to regenerate an internal table that we cannot directly access), you still need
	# to do some QA after changing the database hierarchy.

	# Build the database-ranking table and print it as a JSON array of
	# {"sourceId": ..., "ranking": ...} objects (both values emitted as strings).
	#
	# Default rank is the numeric source ID times 10, which leaves room between
	# neighbours. Three sources are then pinned to ranks 1-3 and two are pushed
	# below every other source.

	# Query the source list once and reuse it; each call goes to the API.
	sources = wb.source.Series()
	df = pd.DataFrame({'name': sources})

	# Index labels are source IDs as strings; cast to int so the default rank
	# follows numeric rather than lexical ID order.
	df['ranking'] = df.index.astype('int64') * 10

	# Highest-priority sources (2, 63, 16), in that order.
	df.loc['2', 'ranking'] = 1
	df.loc['63', 'ranking'] = 2
	df.loc['16', 'ranking'] = 3

	# Lowest-priority sources (11, 57): at least 1000, and always past the
	# largest default rank so they stay last as new sources are added.
	bottom = max(df['ranking'].max()+1, 1000)
	df.loc['11', 'ranking'] = bottom
	df.loc['57', 'ranking'] = bottom + 1

	# Only the ranking column is needed, so iterate it directly instead of
	# building a Series per row with iterrows(). int() keeps the output an integer
	# string even if the column has been upcast to float (which may happen if one
	# of the .loc assignments above targets an ID missing from the API listing).
	rankings = [
		{'sourceId': str(source_id), 'ranking': str(int(rank))}
		for source_id, rank in df['ranking'].items()
	]
	print(json.dumps(rankings))