Last active
February 5, 2024 10:05
-
-
Save vivekteega/c12b9b63d9cd68892a42310ce0220815 to your computer and use it in GitHub Desktop.
Pulling data from ProPublica into an Excel file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pandas as pd | |
import pdb | |
def fetch_org_data(ein): | |
org_data = requests.get(f"https://projects.propublica.org/nonprofits/api/v2/organizations/{ein}.json") | |
if org_data.status_code == 200: | |
org_data = org_data.json() | |
print(f"Processing {org_data['organization']['name']}") | |
org_df = pd.DataFrame() | |
for filing_data in org_data['filings_with_data']: | |
df = pd.DataFrame([filing_data]) | |
for key in org_data['organization'].keys(): | |
df[f"{key}"] = org_data['organization'][f"{key}"] | |
org_df = pd.concat([org_df, df]) | |
for filing_data in org_data['filings_without_data']: | |
df = pd.DataFrame([filing_data]) | |
for key in org_data['organization'].keys(): | |
df[f"{key}"] = org_data['organization'][f"{key}"] | |
org_df = pd.concat([org_df, df]) | |
return org_df | |
else: | |
print(f"Failed to fetch organizaion data. Status code: {org_data.status_code}") | |
def process_api_data(api_data): | |
organizations = api_data['organizations'] | |
propublica_df = pd.DataFrame() | |
for org in organizations: | |
org_df = fetch_org_data(org['ein']) | |
propublica_df = pd.concat([propublica_df, org_df]) | |
return propublica_df | |
def create_excel(search_term=''): | |
api_page = 0 | |
full_df = pd.DataFrame() | |
while True: | |
# Step 1: Fetch data from API | |
api_url = f'https://projects.propublica.org/nonprofits/api/v2/search.json?page={api_page}&p={search_term}' | |
response = requests.get(api_url) | |
# Check if the request was successful (status code 200) | |
if response.status_code == 200: | |
# Step 2: Convert API response to JSON | |
data = response.json() | |
print(f"\nPage {api_page}/{data['num_pages']}\n") | |
full_df = pd.concat([full_df, process_api_data(data)]) | |
else: | |
print(f"Failed to fetch data. Status code: {response.status_code}") | |
api_page = api_page+1 | |
if api_page == data['num_pages']: | |
break | |
# Using openpyxl | |
full_df.to_excel('output.xlsx', index=False) | |
if __name__=='__main__': | |
create_excel() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pandas | |
openpyxl |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment