Created
October 28, 2025 19:19
-
-
Save acbass49/5451920df961d91cf01b035137f70866 to your computer and use it in GitHub Desktop.
21 Leadership Demographics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from google import genai | |
| import httpx | |
| import io | |
| from helium import * | |
| import datetime | |
| import pandas as pd | |
| import numpy as np | |
| import time | |
| from selenium import webdriver | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| from selenium.webdriver.chrome.service import Service as ChromeService | |
| from selenium.webdriver.chrome.options import Options as ChromeOptions | |
| import os | |
| import pyperclip | |
| import survey_tools as st | |
| from scipy.stats import norm | |
| import math | |
| def calculate_margin_of_error( | |
| sample_size: int, | |
| confidence_level: float = 0.95, | |
| sample_proportion: float = 0.5 | |
| ) -> float: | |
| if sample_size <= 0: | |
| raise ValueError("Sample size must be positive.") | |
| z_score = norm.ppf((1 + confidence_level) / 2) | |
| standard_error = math.sqrt( | |
| (sample_proportion * (1 - sample_proportion)) / sample_size | |
| ) | |
| margin_of_error = z_score * standard_error | |
| return margin_of_error | |
| sample_n = 623 | |
| calculate_margin_of_error( | |
| sample_size=sample_n, | |
| confidence_level=0.95 | |
| ) | |
| # Step 1 : get all links | |
| def get_links_style_1(url): | |
| options = ChromeOptions() | |
| options.add_argument("--headless=new") | |
| # This automatically downloads the correct driver and sets the path | |
| service = ChromeService(ChromeDriverManager().install()) | |
| driver = webdriver.Chrome(service=service, options=options) | |
| set_driver(driver) | |
| go_to(url) | |
| time.sleep(5) | |
| links = [] | |
| for item in find_all(S('a[target="_self"]')): | |
| links.append(item.web_element.get_attribute("href")) | |
| return links | |
| def get_links_style_2(url): | |
| options = ChromeOptions() | |
| options.add_argument("--headless=new") | |
| # This automatically downloads the correct driver and sets the path | |
| service = ChromeService(ChromeDriverManager().install()) | |
| driver = webdriver.Chrome(service=service, options=options) | |
| set_driver(driver) | |
| go_to(url) | |
| time.sleep(5) | |
| links = [] | |
| for item in find_all(S('a[class="sc-b68e04a7-5 cQaNDz"]')): | |
| links.append(item.web_element.get_attribute("href")) | |
| return links | |
| links = get_links_style_1("https://www.churchofjesuschrist.org/learn/first-presidency?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/quorum-of-the-twelve-apostles?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/presidency-of-the-seventy?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/presiding-bishopric?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/relief-society-general-presidency?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/young-men-general-presidency?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/young-women-general-presidency?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/primary-general-presidency?lang=eng") + \ | |
| get_links_style_1("https://www.churchofjesuschrist.org/learn/sunday-school-general-presidency?lang=eng") + \ | |
| get_links_style_2("https://www.churchofjesuschrist.org/learn/quorum-of-the-seventy?lang=eng") | |
| # Oaks link is weird so replacing it | |
| links = ['https://www.churchofjesuschrist.org/learn/dallin-h-oaks?lang=eng'] + links[1:] | |
| # removing a few duplicates | |
| links = list(set(links)) | |
| # Step 2: parse each link | |
| def parse_data(links): | |
| columns = [ | |
| 'Name', | |
| 'Calling', | |
| 'Gender', | |
| 'Birth city', | |
| 'Birth state', | |
| 'Birth country', | |
| 'Profession', | |
| 'Profession category', | |
| 'Bachelor Degree', | |
| 'Graduate Degree', | |
| 'Ivy or Ivy Plus', | |
| 'MBA', | |
| 'JD', | |
| 'PHD', | |
| 'Attended BYU', | |
| 'Attended University of Utah', | |
| 'Number of children' | |
| ] | |
| client = genai.Client() | |
| prompt = "I am trying to create a dataset based off of this webpage. Please identify each field each separated by a comma: Name, Latest calling title, Gender, Birth City, Birth State, Birth Country, Profession, Profession Category, bio indicates Has Bachelors Degree, bio indicates has Graduate Degree, bio indicates went to Ivy or Ivy Plus, bio indicates has an MBA, bio indicates has a JD, bio indicates has a PHD, bio indicates attended BYU, bio indicates attended University of Utah, Number of children. If one of the fields is unclear leave it blank. Do not return any more text than the data and the commas between the data. For Binary fields write True or False. Please put dates in ISO 8601 format. Please include only one profession - what seems like the their most recent non-church job. Please include only the most relevant professional category. So, only one professional category. For the Professional Categories, please choose from one of these options: Agriculture forestry fishing or hunting, Mining, Utilities, Construction, Manufacturing, Professional and business services, Educational services, Health care and social assistance, Leisure and hospitality, Other services, Wholesale trade, Retail trade, Transportation and warehousing, Information, Financial activities, Federal government, State and local government. For gender answer either Male or Female." | |
| for num,link in enumerate(links): | |
| print(f"starting on {link}. Progress: {round((num+1/len(links))*100)}% completed") | |
| waiting_for_data = True | |
| while waiting_for_data: | |
| doc_url_1 = link | |
| doc_data_1 = io.BytesIO(httpx.get(doc_url_1).content) | |
| webpage = client.files.upload( | |
| file=doc_data_1, | |
| config=dict(mime_type='text/html') | |
| ) | |
| response = client.models.generate_content( | |
| model="gemini-2.5-flash", | |
| contents=[prompt, webpage]) | |
| print(response.text) | |
| if (len(response.text)<75) or (len(response.text.split(',')) != len(columns)): | |
| time.sleep(6) | |
| print("Yielded empty or faulty data. Trying again.") | |
| continue | |
| response_dict = {} | |
| for idx,val in enumerate(response.text.split(',')): | |
| response_dict[columns[idx]] = val | |
| waiting_for_data = False | |
| #need to add because of rate limits | |
| time.sleep(3) | |
| file_exists = os.path.isfile("../data/leadership/leadership2025.csv") | |
| new_data = pd.DataFrame([response_dict]) | |
| current_d = datetime.datetime.now() | |
| new_data['date_scraped'] = datetime.date(current_d.year, current_d.month, current_d.day) | |
| if file_exists: | |
| data = pd.read_csv("../data/leadership/leadership2025.csv") | |
| data = pd.concat([data, new_data], ignore_index=True) | |
| data.to_csv("../data/leadership/leadership2025.csv", index = False) | |
| else: | |
| new_data.to_csv("../data/leadership/leadership2025.csv", index=False) | |
| return None | |
| parse_data(links) | |
| sheet_id = "1IJvbJwYMWwof4DrBVJoDDX26COp1__hG9dqHNJOdZSI" | |
| data = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv") | |
| country_region_map = { | |
| "United States": "North America", | |
| "USA": "North America", | |
| "Brazil": "South America", | |
| "Mexico": "North America", | |
| "Philippines": "Asia", | |
| "Argentina": "South America", | |
| "Uruguay": "South America", | |
| "Germany": "Europe", | |
| "France": "Europe", | |
| "Peru": "South America", | |
| "Chile": "South America", | |
| "Hong Kong": "Asia", | |
| "Nigeria": "Africa", | |
| "Democratic Republic of the Congo": "Africa", | |
| "England": "Europe", # Often grouped under "Western Europe" or "Northern Europe" (as part of the UK) | |
| "Japan": "Asia", | |
| "Portugal": "Europe", | |
| "Costa Rica": "Central America", | |
| "Taiwan": "Asia", | |
| "Tonga": "Oceania", | |
| "Puerto Rico": "Caribbean", # US Territory, often grouped with the Caribbean or North America | |
| "Fiji": "Oceania", | |
| "New Zealand": "Oceania", | |
| "South Korea": "Asia", | |
| "Italy": "Europe", | |
| "South Africa": "Africa", | |
| "Czech Republic": "Europe", | |
| "Sierra Leone": "Africa", | |
| "Korea": "Asia", # Assuming this refers to South Korea given the other entry | |
| "Australia": "Oceania", | |
| "Netherlands": "Europe", | |
| "Dominican Republic": "Caribbean", # Part of the Caribbean, which is often grouped with North America | |
| "Venezuela": "South America", | |
| "El Salvador": "Central America", | |
| "Zimbabwe": "Africa", | |
| "American Samoa": "Oceania", # US Territory, part of Polynesia | |
| "Botswana": "Africa", | |
| np.nan : np.nan | |
| } | |
| data['region'] = data['Birth country'].apply(lambda x:country_region_map[x]) | |
| data.region.value_counts(normalize=True) | |
| data[data['Birth country'].isin(['USA', "United States"])].shape[0]/data.shape[0] | |
| data[data['Bachelor Degree'].eq(1)].shape[0]/data.shape[0] | |
| data[data['Graduate Degree'].eq(1)].shape[0]/data.shape[0] | |
| data['Birth country'].value_counts() | |
| country_data = pd.read_csv("https://raw.githubusercontent.com/acbass49/scrape_lds_stats/refs/heads/main/data/country-2025-04-27.csv") | |
| state_data = pd.read_csv("https://raw.githubusercontent.com/acbass49/scrape_lds_stats/refs/heads/main/data/state-2025-04-27.csv") | |
| ces_data = pd.read_csv("../data/CES/CES24_Common.csv") | |
| country_region_map_full = { | |
| # North America, Central America, and the Caribbean | |
| 'United States': 'North America', | |
| 'Canada': 'North America', | |
| 'Mexico': 'North America', | |
| 'Antigua and Barbuda': 'Caribbean', | |
| 'Aruba': 'Caribbean', | |
| 'Bahamas': 'Caribbean', | |
| 'Barbados': 'Caribbean', | |
| 'Cayman Islands': 'Caribbean', | |
| 'Curacao': 'Caribbean', | |
| 'Dominica': 'Caribbean', | |
| 'Dominican Republic': 'Caribbean', | |
| 'Grenada': 'Caribbean', | |
| 'Guadeloupe': 'Caribbean', | |
| 'Haiti': 'Caribbean', | |
| 'Jamaica': 'Caribbean', | |
| 'Martinique': 'Caribbean', | |
| 'Puerto Rico': 'Caribbean', | |
| 'Saint Kitts and Nevis': 'Caribbean', | |
| 'Saint Lucia': 'Caribbean', | |
| 'Saint Vincent': 'Caribbean', | |
| 'Sint Maarten': 'Caribbean', | |
| 'Trinidad and Tobago': 'Caribbean', | |
| 'Virgin Islands': 'Caribbean', | |
| 'Belize': 'Central America', | |
| 'Costa Rica': 'Central America', | |
| 'El Salvador': 'Central America', | |
| 'Guatemala': 'Central America', | |
| 'Honduras': 'Central America', | |
| 'Nicaragua': 'Central America', | |
| 'Panama': 'Central America', | |
| # South America | |
| 'Argentina': 'South America', | |
| 'Bolivia': 'South America', | |
| 'Brazil': 'South America', | |
| 'Chile': 'South America', | |
| 'Colombia': 'South America', | |
| 'Ecuador': 'South America', | |
| 'French Guiana': 'South America', | |
| 'Guyana': 'South America', | |
| 'Paraguay': 'South America', | |
| 'Peru': 'South America', | |
| 'Suriname': 'South America', | |
| 'Uruguay': 'South America', | |
| 'Venezuela': 'South America', | |
| # Europe | |
| 'Albania': 'Europe', | |
| 'Andorra': 'Europe', | |
| 'Armenia': 'Europe', # Transcontinental, often grouped with Europe | |
| 'Austria': 'Europe', | |
| 'Belgium': 'Europe', | |
| 'Bosnia and Herzegovina': 'Europe', | |
| 'Bulgaria': 'Europe', | |
| 'Croatia': 'Europe', | |
| 'Cyprus': 'Europe', # Geographically in Asia, but politically/culturally in Europe | |
| 'Czech Republic': 'Europe', | |
| 'Denmark': 'Europe', | |
| 'Estonia': 'Europe', | |
| 'Finland': 'Europe', | |
| 'France': 'Europe', | |
| 'Georgia': 'Europe', # Transcontinental, often grouped with Europe | |
| 'Germany': 'Europe', | |
| 'Greece': 'Europe', | |
| 'Guernsey': 'Europe', | |
| 'Hungary': 'Europe', | |
| 'Iceland': 'Europe', | |
| 'Ireland': 'Europe', | |
| 'Isle of Man': 'Europe', | |
| 'Italy': 'Europe', | |
| 'Jersey': 'Europe', | |
| 'Latvia': 'Europe', | |
| 'Lithuania': 'Europe', | |
| 'Luxembourg': 'Europe', | |
| 'Malta': 'Europe', | |
| 'Moldova': 'Europe', | |
| 'Montenegro': 'Europe', | |
| 'Netherlands': 'Europe', | |
| 'Norway': 'Europe', | |
| 'Poland': 'Europe', | |
| 'Portugal': 'Europe', | |
| 'Romania': 'Europe', | |
| 'Russia': 'Europe', # European portion is most populated, often grouped as Europe or Eurasia | |
| 'Serbia': 'Europe', | |
| 'Slovakia': 'Europe', | |
| 'Slovenia': 'Europe', | |
| 'Spain': 'Europe', | |
| 'Sweden': 'Europe', | |
| 'Switzerland': 'Europe', | |
| 'Turkey': 'Europe', # Transcontinental, often grouped with Europe/Middle East | |
| 'Ukraine': 'Europe', | |
| 'United Kingdom': 'Europe', | |
| # Asia | |
| 'Bahrain': 'Asia', | |
| 'Cambodia': 'Asia', | |
| 'China': 'Asia', | |
| 'Hong Kong': 'Asia', | |
| 'India': 'Asia', | |
| 'Indonesia': 'Asia', | |
| 'Israel': 'Asia', # Often Middle East, which is part of Asia | |
| 'Japan': 'Asia', | |
| 'Kazakhstan': 'Asia', # Transcontinental, Central Asia | |
| 'Kuwait': 'Asia', | |
| 'Macau': 'Asia', | |
| 'Malaysia': 'Asia', | |
| 'Mongolia': 'Asia', | |
| 'Philippines': 'Asia', | |
| 'Singapore': 'Asia', | |
| 'South Korea': 'Asia', | |
| 'Sri Lanka': 'Asia', | |
| 'Taiwan': 'Asia', | |
| 'Thailand': 'Asia', | |
| 'United Arab Emirates': 'Asia', | |
| # Oceania | |
| 'American Samoa': 'Oceania', | |
| 'Australia': 'Oceania', | |
| 'Cook Islands': 'Oceania', | |
| 'Fiji': 'Oceania', | |
| 'French Polynesia': 'Oceania', | |
| 'Guam': 'Oceania', | |
| 'Kiribati': 'Oceania', | |
| 'Marshall Islands': 'Oceania', | |
| 'Micronesia': 'Oceania', | |
| 'Nauru': 'Oceania', | |
| 'New Caledonia': 'Oceania', | |
| 'New Zealand': 'Oceania', | |
| 'Niue': 'Oceania', | |
| 'Northern Mariana Islands': 'Oceania', | |
| 'Palau': 'Oceania', | |
| 'Papua New Guinea': 'Oceania', | |
| 'Samoa': 'Oceania', | |
| 'Solomon Islands': 'Oceania', | |
| 'Tonga': 'Oceania', | |
| 'Tuvalu': 'Oceania', | |
| 'Vanuatu': 'Oceania', | |
| # Africa | |
| 'Angola': 'Africa', | |
| 'Benin': 'Africa', | |
| 'Botswana': 'Africa', | |
| 'Burundi': 'Africa', | |
| 'Cameroon': 'Africa', | |
| 'Cape Verde': 'Africa', | |
| 'Central African Republic': 'Africa', | |
| "Cote d'Ivoire": 'Africa', | |
| 'Democratic Republic of the Congo': 'Africa', | |
| 'Ethiopia': 'Africa', | |
| 'Ghana': 'Africa', | |
| 'Kenya': 'Africa', | |
| 'Lesotho': 'Africa', | |
| 'Liberia': 'Africa', | |
| 'Madagascar': 'Africa', | |
| 'Malawi': 'Africa', | |
| 'Mauritius': 'Africa', | |
| 'Mozambique': 'Africa', | |
| 'Namibia': 'Africa', | |
| 'Nigeria': 'Africa', | |
| 'Republic of the Congo': 'Africa', | |
| 'Reunion': 'Africa', | |
| 'Rwanda': 'Africa', | |
| 'Sierra Leone': 'Africa', | |
| 'South Africa': 'Africa', | |
| 'Swaziland': 'Africa', # Now officially Eswatini, but still mapped to Africa | |
| 'Tanzania': 'Africa', | |
| 'Togo': 'Africa', | |
| 'Uganda': 'Africa', | |
| 'Zambia': 'Africa', | |
| 'Zimbabwe': 'Africa' | |
| } | |
| country_data['region'] = country_data['Name'].apply(lambda x:country_region_map_full[x]) | |
| data["Birth country"] = data["Birth country"].replace({"United States":"USA"}) | |
| # Figure 1 | |
| data.region.value_counts(normalize=True) | |
| country_data \ | |
| .groupby("region") \ | |
| .sum()['TotalChurchMembership'] \ | |
| .reset_index() \ | |
| .assign(prop = lambda x:x['TotalChurchMembership']/x['TotalChurchMembership'].sum()) | |
| # mexico quick look | |
| country_data[country_data.Name == "Mexico"].TotalChurchMembership.iloc[0]/country_data.TotalChurchMembership.sum() | |
| data['Birth country'].value_counts(normalize=True) | |
| # Utah vs. US quick look | |
| state_data[state_data.Name == "Utah"].TotalChurchMembership.iloc[0]/country_data[country_data.Name == "United States"].TotalChurchMembership.iloc[0] | |
| # 32% of members are in Utah | |
| data[data['Birth state'] == "Utah"].shape[0]/data[data['Birth country'] == "USA"].shape[0] | |
| # 51% of leaders are from Utah | |
| # Figure 2 | |
| # Career choice | |
| st.tabs(ces_data.query("religpew == 3"), "industry", wts="commonweight", display="column") | |
| ces_data['industry_rc'] = st.recode(ces_data, "industry", "10=NaN") | |
| to_clip = st.tabs(ces_data.query("religpew == 3"), "industry_rc", wts="commonweight", display="column").to_list() | |
| to_clip = list(map(str, to_clip)) | |
| pyperclip.copy('\n'.join(to_clip)) | |
| to_clip = st.tabs(data, "Profession category", display="column").to_list() | |
| to_clip = list(map(str, to_clip)) | |
| pyperclip.copy('\n'.join(to_clip)) | |
| #figure 3 | |
| # education level | |
| to_clip = st.tabs(ces_data.query("religpew == 3"), "educ", wts="commonweight", display="column").to_list() | |
| to_clip = list(map(str, to_clip)) | |
| data['Bachelor Degree'].sum()/data.shape[0] | |
| data['Graduate Degree'].sum()/data.shape[0] | |
| pyperclip.copy('\n'.join(to_clip)) | |
| st.tabs(ces_data.query("religpew == 3"), "educ", wts="commonweight", display="column") | |
| st.tabs(ces_data, "educ", wts="commonweight", display="column") | |
| #figure 4 | |
| data[data.Gender == "Male"].shape[0]/data.shape[0] | |
| # 94% Male | |
| data[data.White == "Yes"].shape[0]/data.shape[0] | |
| # 62% White | |
| #figure 5 | |
| data[data.MBA].shape[0]/data.shape[0] #38% | |
| data[data.JD].shape[0]/data.shape[0] #15% | |
| data[data.PHD].shape[0]/data.shape[0] #7% | |
| data[((data.MBA + data.JD + data.PHD)==0)&data['Graduate Degree']].shape[0]/data.shape[0] | |
| #16% | |
| #figure 6 | |
| data[data["Attended BYU"]].shape[0]/data.shape[0] #53% | |
| data[data["Attended University of Utah"]].shape[0]/data.shape[0] #16% | |
| #figure 7 | |
| #Q15? 8/15 53% | |
| data[data["Ivy or Ivy Plus"]].shape[0]/data.shape[0] #18% | |
| #there are 14 ivy or ivy plus schools | |
| # Less than 1% of US college students attend ivy league schools and only ~34% have attended | |
| #college. This means a fractiion of a percent of the US has attended ivy leagues | |
| #not sure what the LDS number is, but it is likely quite small and perhaps similar and definitly not 18 or 53%. | |
| #figure 8 | |
| tbl1 = data.region.value_counts(normalize=True).reset_index().rename(columns = {"index":"region", "region":"leadership_ratio"}) | |
| tbl2 = country_data \ | |
| .groupby("region") \ | |
| .sum()['TotalChurchMembership'] \ | |
| .reset_index() \ | |
| .assign(prop = lambda x:x['TotalChurchMembership']/x['TotalChurchMembership'].sum()) | |
| final = country_data \ | |
| .assign(MpC = lambda x:x['TotalChurchMembership']/x['Congregations']) \ | |
| .groupby("region") \ | |
| .mean()['MpC'] \ | |
| .reset_index() \ | |
| .merge(tbl1, on = "region") \ | |
| .merge(tbl2, on = "region") \ | |
| .assign(leadership_ratio = lambda x:x["leadership_ratio"] - x["prop"]) \ | |
| .assign(leadership_ratio = lambda x:round(x["leadership_ratio"]*100,2)) | |
| pyperclip.copy(final) | |
| final['MpC'].corr(final['leadership_ratio']) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment