konstantinstadler/add_class_coco.py

## add_class_coco.py
""" Parse Global Health Data Exchange (GHDx) / Global Burden of Disease (GBD) numeric country codes for coco

This only needs to be done once, but it may also serve as a guide for other inputs.

Data sources:

    - GHDx: http://ghdx.healthdata.org/
    - Codebook with country codes: http://ghdx.healthdata.org/sites/default/files/ihme_query_tool/IHME_GBD_2019_CODEBOOK.zip

"""

import pandas as pd
import country_converter

# Load the GBD location hierarchy sheet; the first row holds the column names.
data = pd.read_excel(
    './IHME_GBD_2019_ALL_LOCATIONS_HIERARCHIES_Y2020M10D15.XLSX',
    sheet_name='Sheet1',
    header=0,
    engine='openpyxl',
)
col_new_code = 'Location ID'
col_country_names = 'Location Name'

# Rows repeating the same (code, name) pair add nothing; keep one of each.
data.drop_duplicates(subset=[col_new_code, col_country_names], inplace=True)

# Convert every location name to the coco short name. Names coco cannot
# match are tagged with the 'not_found' marker and filtered out right below.
coco = country_converter.CountryConverter(include_obsolete=True)
data.loc[:, 'converted'] = data.loc[:, col_country_names].apply(
    coco.convert, src='regex', to='name_short', not_found='not_found'
)

converted_with_duplicates = data[data.converted != 'not_found']

# Results
# Several location rows can resolve to the same coco name. keep=False flags
# every member of such a group so they can all be reviewed by hand.
found_duplicates = converted_with_duplicates[
    converted_with_duplicates.loc[:, 'converted'].duplicated(keep=False)
]
# One row per coco name (first occurrence wins), ordered like the country
# converter's own table; coco names without a match become empty rows.
result_removed_duplicates = (
    converted_with_duplicates
    .drop_duplicates(subset=['converted'])
    .set_index('converted', drop=True)
    .reindex(coco.data.name_short)
    .fillna('')
)

# save results
# ExcelWriter.save() was removed in pandas 2.0; the context manager writes
# and closes the workbook on exit, even if one of the to_excel calls fails.
with pd.ExcelWriter('converted.xlsx', engine='openpyxl') as xlsxwriter:
    found_duplicates.to_excel(xlsxwriter, sheet_name='duplicates')
    result_removed_duplicates.to_excel(xlsxwriter, sheet_name='sorted_results')
	""" Parse Global Health Data Exchange (GHDx) / Global Burden of Disease (GBD) numeric country codes for coco

	This only needs to be done once, but it may also serve as a guide for other inputs.

	Data sources:

	- GHDx: http://ghdx.healthdata.org/
	- Codebook with country codes: http://ghdx.healthdata.org/sites/default/files/ihme_query_tool/IHME_GBD_2019_CODEBOOK.zip

	"""

	import pandas as pd
	import country_converter

	# Load the GBD location hierarchy sheet; the first row holds the column names.
	data = pd.read_excel(
		'./IHME_GBD_2019_ALL_LOCATIONS_HIERARCHIES_Y2020M10D15.XLSX',
		sheet_name='Sheet1',
		header=0,
		engine='openpyxl',
	)
	col_new_code = 'Location ID'
	col_country_names = 'Location Name'

	# Rows repeating the same (code, name) pair add nothing; keep one of each.
	data.drop_duplicates(subset=[col_new_code, col_country_names], inplace=True)

	# Convert every location name to the coco short name. Names coco cannot
	# match are tagged with the 'not_found' marker and filtered out right below.
	coco = country_converter.CountryConverter(include_obsolete=True)
	data.loc[:, 'converted'] = data.loc[:, col_country_names].apply(
		coco.convert, src='regex', to='name_short', not_found='not_found'
	)

	converted_with_duplicates = data[data.converted != 'not_found']

	# Results
	# Several location rows can resolve to the same coco name. keep=False flags
	# every member of such a group so they can all be reviewed by hand.
	found_duplicates = converted_with_duplicates[
		converted_with_duplicates.loc[:, 'converted'].duplicated(keep=False)
	]
	# One row per coco name (first occurrence wins), ordered like the country
	# converter's own table; coco names without a match become empty rows.
	result_removed_duplicates = (
		converted_with_duplicates
		.drop_duplicates(subset=['converted'])
		.set_index('converted', drop=True)
		.reindex(coco.data.name_short)
		.fillna('')
	)

	# save results
	# ExcelWriter.save() was removed in pandas 2.0; the context manager writes
	# and closes the workbook on exit, even if one of the to_excel calls fails.
	with pd.ExcelWriter('converted.xlsx', engine='openpyxl') as xlsxwriter:
		found_duplicates.to_excel(xlsxwriter, sheet_name='duplicates')
		result_removed_duplicates.to_excel(xlsxwriter, sheet_name='sorted_results')