Skip to content

Instantly share code, notes, and snippets.

@vivekteega
Last active February 5, 2024 10:05
Show Gist options
  • Save vivekteega/c12b9b63d9cd68892a42310ce0220815 to your computer and use it in GitHub Desktop.
Save vivekteega/c12b9b63d9cd68892a42310ce0220815 to your computer and use it in GitHub Desktop.
Pulling data from ProPublica into an Excel file
import requests
import pandas as pd
import pdb
def fetch_org_data(ein):
org_data = requests.get(f"https://projects.propublica.org/nonprofits/api/v2/organizations/{ein}.json")
if org_data.status_code == 200:
org_data = org_data.json()
print(f"Processing {org_data['organization']['name']}")
org_df = pd.DataFrame()
for filing_data in org_data['filings_with_data']:
df = pd.DataFrame([filing_data])
for key in org_data['organization'].keys():
df[f"{key}"] = org_data['organization'][f"{key}"]
org_df = pd.concat([org_df, df])
for filing_data in org_data['filings_without_data']:
df = pd.DataFrame([filing_data])
for key in org_data['organization'].keys():
df[f"{key}"] = org_data['organization'][f"{key}"]
org_df = pd.concat([org_df, df])
return org_df
else:
print(f"Failed to fetch organizaion data. Status code: {org_data.status_code}")
def process_api_data(api_data):
organizations = api_data['organizations']
propublica_df = pd.DataFrame()
for org in organizations:
org_df = fetch_org_data(org['ein'])
propublica_df = pd.concat([propublica_df, org_df])
return propublica_df
def create_excel(search_term=''):
api_page = 0
full_df = pd.DataFrame()
while True:
# Step 1: Fetch data from API
api_url = f'https://projects.propublica.org/nonprofits/api/v2/search.json?page={api_page}&p={search_term}'
response = requests.get(api_url)
# Check if the request was successful (status code 200)
if response.status_code == 200:
# Step 2: Convert API response to JSON
data = response.json()
print(f"\nPage {api_page}/{data['num_pages']}\n")
full_df = pd.concat([full_df, process_api_data(data)])
else:
print(f"Failed to fetch data. Status code: {response.status_code}")
api_page = api_page+1
if api_page == data['num_pages']:
break
# Using openpyxl
full_df.to_excel('output.xlsx', index=False)
if __name__=='__main__':
create_excel()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment