Skip to content

Instantly share code, notes, and snippets.

@arnos-stuff
Created October 24, 2023 22:46
Show Gist options
  • Save arnos-stuff/6a3db7b3bcfd808d885da506714f876a to your computer and use it in GitHub Desktop.
Save arnos-stuff/6a3db7b3bcfd808d885da506714f876a to your computer and use it in GitHub Desktop.
Python script that collects World Bank indicators which might correlate with GDP into a single pandas DataFrame.
import pandas as pd
import plotly.express as px
import wbdata as wb
import json
from typing import Dict, List
from pathlib import Path
# Load the metric definitions from JSON. The path is relative to the current
# working directory.
metricsPath = Path("./metrics-gdp.json")
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector; JSON text is read as UTF-8.
with metricsPath.open(encoding="utf-8") as metricsFile:
    metrics = json.load(metricsFile)
# Rename table applied to the flattened records: dotted names coming out of
# pd.json_normalize are mapped to flat, underscore-separated column names.
colNames = {
    'indicator.id':    'indicator_id',
    'indicator.value': 'indicator',
    'country.id':      'country_code',
    'country.value':   'country',
    'countryiso3code': 'country_isocode',
}

# Columns removed from every fetched metric frame.
colDrops = ['unit', 'obs_status', 'decimal']
def fetchMetric(metricDef: Dict[str, str]) -> pd.DataFrame:
    """Download one indicator and return it as a flat DataFrame.

    The records returned by ``wb.get_data`` are flattened with
    ``pd.json_normalize``, renamed according to ``colNames`` and stripped of
    the columns listed in ``colDrops``.

    Args:
        metricDef: Metric definition; only its ``'id'`` entry is read and
            passed to ``wb.get_data``.

    Returns:
        The flattened, renamed records of the indicator.

    Raises:
        RuntimeError: Re-raised unchanged after printing ``metricDef`` so the
            failing metric can be identified.
    """
    # Named indicatorId rather than `id` to avoid shadowing the builtin.
    indicatorId = metricDef['id']
    try:
        data = (
            pd.json_normalize(wb.get_data(indicatorId))
            .rename(columns=colNames)
            # errors='ignore': an empty response (no columns at all) or one
            # lacking some of these columns must not fail with KeyError.
            .drop(columns=colDrops, errors='ignore')
        )
    except RuntimeError:
        print(metricDef)
        # Bare raise keeps the original exception and traceback intact.
        raise
    return data
def colFormat(title: str) -> str:
    """Convert an indicator title into a lowercase column name.

    The title is lower-cased and a fixed list of literal substring
    substitutions is then applied one after another, e.g. spaces and dashes
    become underscores, parentheses and commas are removed.
    """
    # Applied in insertion order, which matters: ' of ' has to be collapsed
    # before the remaining spaces are turned into underscores, and 'us$'
    # before the bare '$'.
    substitutions = {
        '/': 'per',
        ' of ': ' ',
        '-': '_',
        ' ': '_',
        'us$': 'usd',
        '$': 'usd',
        '(': '',
        ')': '',
        ',': '',
        '%': 'percent',
    }
    formatted = title.lower()
    for needle, replacement in substitutions.items():
        formatted = formatted.replace(needle, replacement)
    return formatted
def splitDateInterval(df: pd.DataFrame) -> pd.DataFrame:
    """Expand rows whose ``date`` is a year interval into single-year rows.

    A row dated ``'2012-2016'`` is replaced by copies dated with the first
    year, the last year and the floored middle year of the interval. Rows
    whose date contains no ``'-'`` are passed through untouched.

    Args:
        df: Frame with a ``date`` column holding values such as ``'2015'``
            or ``'2012-2016'``.

    Returns:
        ``df`` itself when it holds no interval dates. Otherwise a new frame
        made of the non-interval rows followed by the first-year, last-year
        and middle-year copies. The generated dates are strings, like the
        dates of the untouched rows, and the original index labels are kept
        (so they repeat for expanded rows).
    """
    dateText = df.date.astype(str)
    # Literal match; missing dates count as "not an interval".
    isInterval = dateText.str.contains('-', regex=False, na=False)
    if not isInterval.any():
        return df

    intervals = df[isInterval]
    bounds = dateText[isInterval].str.split('-', expand=True)
    minYear = bounds[0].str.strip().astype(int)
    maxYear = bounds[1].str.strip().astype(int)
    midYear = (minYear + maxYear) // 2

    # Dates are written back as strings so the column keeps a single type
    # and still compares equal to the plain-year strings of other frames.
    pieces = [
        df[~isInterval],
        intervals.assign(date=minYear.astype(str).to_numpy()),
    ]
    # Each year is emitted once per source row: the last/middle year is
    # skipped where it coincides with a year already produced for that row
    # (intervals spanning zero or one year).
    extraYears = (
        (maxYear, maxYear != minYear),
        (midYear, (midYear != minYear) & (midYear != maxYear)),
    )
    for years, keep in extraYears:
        keepMask = keep.to_numpy()
        if keepMask.any():
            pieces.append(
                intervals[keepMask].assign(
                    date=years[keepMask].astype(str).to_numpy()
                )
            )
    # Empty pieces are left out so they cannot influence the result dtypes.
    return pd.concat([piece for piece in pieces if not piece.empty], axis=0)
def preformat(df: pd.DataFrame, minYear: int | None = None) -> pd.DataFrame:
    """Reduce a fetched metric frame to one named value column.

    The ``country``, ``country_code``, ``indicator_id`` and ``indicator``
    columns are dropped, ``value`` is renamed to the ``colFormat``-ed
    indicator title, interval dates are expanded with ``splitDateInterval``
    and the rows are optionally filtered by year.

    Args:
        df: Frame holding the columns named above plus ``value`` and
            ``date``.
        minYear: When given, only rows whose year is strictly greater than
            this value are kept. ``None`` disables the filter.

    Returns:
        The reduced frame.
    """
    df = df.drop(columns=['country', 'country_code', 'indicator_id'])
    # Presumably every row carries the same indicator title; if several are
    # present, the last distinct one names the value column.
    name = colFormat(df.indicator.unique().tolist()[-1])
    df = df.rename(columns={'value': name}).drop(columns=['indicator'])
    df = splitDateInterval(df)
    # Compare against None explicitly so that a threshold of 0 is honoured
    # instead of being mistaken for "no filter".
    if minYear is not None:
        df = df.loc[df.date.astype(int) > minYear, :].copy()
    return df
def fetchMetrics(metricList: List[Dict[str,str]], minYear: int | None = None) -> pd.DataFrame :
metricDfs = map(fetchMetric, metricList)
left = next(metricDfs)
remaining = metricDfs
for right in remaining:
left = pd.merge(left=left, right=preformat(right, minYear=minYear), how='inner', on=['date', 'country_isocode'])
return left
# Script entry point: fetch and merge every metric listed in `metrics`,
# passing 2010 as the year threshold.
if __name__ == "__main__":
    df = fetchMetrics(metricList=metrics, minYear=2010)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment