Skip to content

Instantly share code, notes, and snippets.

@acbass49
Created October 28, 2025 19:19
Show Gist options
  • Select an option

  • Save acbass49/5451920df961d91cf01b035137f70866 to your computer and use it in GitHub Desktop.

Select an option

Save acbass49/5451920df961d91cf01b035137f70866 to your computer and use it in GitHub Desktop.
21 Leadership Demographics
from google import genai
import httpx
import io
from helium import *
import datetime
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
import os
import pyperclip
import survey_tools as st
from scipy.stats import norm
import math
def calculate_margin_of_error(
sample_size: int,
confidence_level: float = 0.95,
sample_proportion: float = 0.5
) -> float:
if sample_size <= 0:
raise ValueError("Sample size must be positive.")
z_score = norm.ppf((1 + confidence_level) / 2)
standard_error = math.sqrt(
(sample_proportion * (1 - sample_proportion)) / sample_size
)
margin_of_error = z_score * standard_error
return margin_of_error
sample_n = 623
calculate_margin_of_error(
sample_size=sample_n,
confidence_level=0.95
)
# Step 1 : get all links
def get_links_style_1(url):
options = ChromeOptions()
options.add_argument("--headless=new")
# This automatically downloads the correct driver and sets the path
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
set_driver(driver)
go_to(url)
time.sleep(5)
links = []
for item in find_all(S('a[target="_self"]')):
links.append(item.web_element.get_attribute("href"))
return links
def get_links_style_2(url):
options = ChromeOptions()
options.add_argument("--headless=new")
# This automatically downloads the correct driver and sets the path
service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
set_driver(driver)
go_to(url)
time.sleep(5)
links = []
for item in find_all(S('a[class="sc-b68e04a7-5 cQaNDz"]')):
links.append(item.web_element.get_attribute("href"))
return links
links = get_links_style_1("https://www.churchofjesuschrist.org/learn/first-presidency?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/quorum-of-the-twelve-apostles?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/presidency-of-the-seventy?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/presiding-bishopric?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/relief-society-general-presidency?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/young-men-general-presidency?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/young-women-general-presidency?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/primary-general-presidency?lang=eng") + \
get_links_style_1("https://www.churchofjesuschrist.org/learn/sunday-school-general-presidency?lang=eng") + \
get_links_style_2("https://www.churchofjesuschrist.org/learn/quorum-of-the-seventy?lang=eng")
# Oaks link is weird so replacing it
links = ['https://www.churchofjesuschrist.org/learn/dallin-h-oaks?lang=eng'] + links[1:]
# removing a few duplicates
links = list(set(links))
# Step 2: parse each link
def parse_data(links):
columns = [
'Name',
'Calling',
'Gender',
'Birth city',
'Birth state',
'Birth country',
'Profession',
'Profession category',
'Bachelor Degree',
'Graduate Degree',
'Ivy or Ivy Plus',
'MBA',
'JD',
'PHD',
'Attended BYU',
'Attended University of Utah',
'Number of children'
]
client = genai.Client()
prompt = "I am trying to create a dataset based off of this webpage. Please identify each field each separated by a comma: Name, Latest calling title, Gender, Birth City, Birth State, Birth Country, Profession, Profession Category, bio indicates Has Bachelors Degree, bio indicates has Graduate Degree, bio indicates went to Ivy or Ivy Plus, bio indicates has an MBA, bio indicates has a JD, bio indicates has a PHD, bio indicates attended BYU, bio indicates attended University of Utah, Number of children. If one of the fields is unclear leave it blank. Do not return any more text than the data and the commas between the data. For Binary fields write True or False. Please put dates in ISO 8601 format. Please include only one profession - what seems like the their most recent non-church job. Please include only the most relevant professional category. So, only one professional category. For the Professional Categories, please choose from one of these options: Agriculture forestry fishing or hunting, Mining, Utilities, Construction, Manufacturing, Professional and business services, Educational services, Health care and social assistance, Leisure and hospitality, Other services, Wholesale trade, Retail trade, Transportation and warehousing, Information, Financial activities, Federal government, State and local government. For gender answer either Male or Female."
for num,link in enumerate(links):
print(f"starting on {link}. Progress: {round((num+1/len(links))*100)}% completed")
waiting_for_data = True
while waiting_for_data:
doc_url_1 = link
doc_data_1 = io.BytesIO(httpx.get(doc_url_1).content)
webpage = client.files.upload(
file=doc_data_1,
config=dict(mime_type='text/html')
)
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=[prompt, webpage])
print(response.text)
if (len(response.text)<75) or (len(response.text.split(',')) != len(columns)):
time.sleep(6)
print("Yielded empty or faulty data. Trying again.")
continue
response_dict = {}
for idx,val in enumerate(response.text.split(',')):
response_dict[columns[idx]] = val
waiting_for_data = False
#need to add because of rate limits
time.sleep(3)
file_exists = os.path.isfile("../data/leadership/leadership2025.csv")
new_data = pd.DataFrame([response_dict])
current_d = datetime.datetime.now()
new_data['date_scraped'] = datetime.date(current_d.year, current_d.month, current_d.day)
if file_exists:
data = pd.read_csv("../data/leadership/leadership2025.csv")
data = pd.concat([data, new_data], ignore_index=True)
data.to_csv("../data/leadership/leadership2025.csv", index = False)
else:
new_data.to_csv("../data/leadership/leadership2025.csv", index=False)
return None
parse_data(links)
sheet_id = "1IJvbJwYMWwof4DrBVJoDDX26COp1__hG9dqHNJOdZSI"
data = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv")
country_region_map = {
"United States": "North America",
"USA": "North America",
"Brazil": "South America",
"Mexico": "North America",
"Philippines": "Asia",
"Argentina": "South America",
"Uruguay": "South America",
"Germany": "Europe",
"France": "Europe",
"Peru": "South America",
"Chile": "South America",
"Hong Kong": "Asia",
"Nigeria": "Africa",
"Democratic Republic of the Congo": "Africa",
"England": "Europe", # Often grouped under "Western Europe" or "Northern Europe" (as part of the UK)
"Japan": "Asia",
"Portugal": "Europe",
"Costa Rica": "Central America",
"Taiwan": "Asia",
"Tonga": "Oceania",
"Puerto Rico": "Caribbean", # US Territory, often grouped with the Caribbean or North America
"Fiji": "Oceania",
"New Zealand": "Oceania",
"South Korea": "Asia",
"Italy": "Europe",
"South Africa": "Africa",
"Czech Republic": "Europe",
"Sierra Leone": "Africa",
"Korea": "Asia", # Assuming this refers to South Korea given the other entry
"Australia": "Oceania",
"Netherlands": "Europe",
"Dominican Republic": "Caribbean", # Part of the Caribbean, which is often grouped with North America
"Venezuela": "South America",
"El Salvador": "Central America",
"Zimbabwe": "Africa",
"American Samoa": "Oceania", # US Territory, part of Polynesia
"Botswana": "Africa",
np.nan : np.nan
}
data['region'] = data['Birth country'].apply(lambda x:country_region_map[x])
data.region.value_counts(normalize=True)
data[data['Birth country'].isin(['USA', "United States"])].shape[0]/data.shape[0]
data[data['Bachelor Degree'].eq(1)].shape[0]/data.shape[0]
data[data['Graduate Degree'].eq(1)].shape[0]/data.shape[0]
data['Birth country'].value_counts()
country_data = pd.read_csv("https://raw.githubusercontent.com/acbass49/scrape_lds_stats/refs/heads/main/data/country-2025-04-27.csv")
state_data = pd.read_csv("https://raw.githubusercontent.com/acbass49/scrape_lds_stats/refs/heads/main/data/state-2025-04-27.csv")
ces_data = pd.read_csv("../data/CES/CES24_Common.csv")
country_region_map_full = {
# North America, Central America, and the Caribbean
'United States': 'North America',
'Canada': 'North America',
'Mexico': 'North America',
'Antigua and Barbuda': 'Caribbean',
'Aruba': 'Caribbean',
'Bahamas': 'Caribbean',
'Barbados': 'Caribbean',
'Cayman Islands': 'Caribbean',
'Curacao': 'Caribbean',
'Dominica': 'Caribbean',
'Dominican Republic': 'Caribbean',
'Grenada': 'Caribbean',
'Guadeloupe': 'Caribbean',
'Haiti': 'Caribbean',
'Jamaica': 'Caribbean',
'Martinique': 'Caribbean',
'Puerto Rico': 'Caribbean',
'Saint Kitts and Nevis': 'Caribbean',
'Saint Lucia': 'Caribbean',
'Saint Vincent': 'Caribbean',
'Sint Maarten': 'Caribbean',
'Trinidad and Tobago': 'Caribbean',
'Virgin Islands': 'Caribbean',
'Belize': 'Central America',
'Costa Rica': 'Central America',
'El Salvador': 'Central America',
'Guatemala': 'Central America',
'Honduras': 'Central America',
'Nicaragua': 'Central America',
'Panama': 'Central America',
# South America
'Argentina': 'South America',
'Bolivia': 'South America',
'Brazil': 'South America',
'Chile': 'South America',
'Colombia': 'South America',
'Ecuador': 'South America',
'French Guiana': 'South America',
'Guyana': 'South America',
'Paraguay': 'South America',
'Peru': 'South America',
'Suriname': 'South America',
'Uruguay': 'South America',
'Venezuela': 'South America',
# Europe
'Albania': 'Europe',
'Andorra': 'Europe',
'Armenia': 'Europe', # Transcontinental, often grouped with Europe
'Austria': 'Europe',
'Belgium': 'Europe',
'Bosnia and Herzegovina': 'Europe',
'Bulgaria': 'Europe',
'Croatia': 'Europe',
'Cyprus': 'Europe', # Geographically in Asia, but politically/culturally in Europe
'Czech Republic': 'Europe',
'Denmark': 'Europe',
'Estonia': 'Europe',
'Finland': 'Europe',
'France': 'Europe',
'Georgia': 'Europe', # Transcontinental, often grouped with Europe
'Germany': 'Europe',
'Greece': 'Europe',
'Guernsey': 'Europe',
'Hungary': 'Europe',
'Iceland': 'Europe',
'Ireland': 'Europe',
'Isle of Man': 'Europe',
'Italy': 'Europe',
'Jersey': 'Europe',
'Latvia': 'Europe',
'Lithuania': 'Europe',
'Luxembourg': 'Europe',
'Malta': 'Europe',
'Moldova': 'Europe',
'Montenegro': 'Europe',
'Netherlands': 'Europe',
'Norway': 'Europe',
'Poland': 'Europe',
'Portugal': 'Europe',
'Romania': 'Europe',
'Russia': 'Europe', # European portion is most populated, often grouped as Europe or Eurasia
'Serbia': 'Europe',
'Slovakia': 'Europe',
'Slovenia': 'Europe',
'Spain': 'Europe',
'Sweden': 'Europe',
'Switzerland': 'Europe',
'Turkey': 'Europe', # Transcontinental, often grouped with Europe/Middle East
'Ukraine': 'Europe',
'United Kingdom': 'Europe',
# Asia
'Bahrain': 'Asia',
'Cambodia': 'Asia',
'China': 'Asia',
'Hong Kong': 'Asia',
'India': 'Asia',
'Indonesia': 'Asia',
'Israel': 'Asia', # Often Middle East, which is part of Asia
'Japan': 'Asia',
'Kazakhstan': 'Asia', # Transcontinental, Central Asia
'Kuwait': 'Asia',
'Macau': 'Asia',
'Malaysia': 'Asia',
'Mongolia': 'Asia',
'Philippines': 'Asia',
'Singapore': 'Asia',
'South Korea': 'Asia',
'Sri Lanka': 'Asia',
'Taiwan': 'Asia',
'Thailand': 'Asia',
'United Arab Emirates': 'Asia',
# Oceania
'American Samoa': 'Oceania',
'Australia': 'Oceania',
'Cook Islands': 'Oceania',
'Fiji': 'Oceania',
'French Polynesia': 'Oceania',
'Guam': 'Oceania',
'Kiribati': 'Oceania',
'Marshall Islands': 'Oceania',
'Micronesia': 'Oceania',
'Nauru': 'Oceania',
'New Caledonia': 'Oceania',
'New Zealand': 'Oceania',
'Niue': 'Oceania',
'Northern Mariana Islands': 'Oceania',
'Palau': 'Oceania',
'Papua New Guinea': 'Oceania',
'Samoa': 'Oceania',
'Solomon Islands': 'Oceania',
'Tonga': 'Oceania',
'Tuvalu': 'Oceania',
'Vanuatu': 'Oceania',
# Africa
'Angola': 'Africa',
'Benin': 'Africa',
'Botswana': 'Africa',
'Burundi': 'Africa',
'Cameroon': 'Africa',
'Cape Verde': 'Africa',
'Central African Republic': 'Africa',
"Cote d'Ivoire": 'Africa',
'Democratic Republic of the Congo': 'Africa',
'Ethiopia': 'Africa',
'Ghana': 'Africa',
'Kenya': 'Africa',
'Lesotho': 'Africa',
'Liberia': 'Africa',
'Madagascar': 'Africa',
'Malawi': 'Africa',
'Mauritius': 'Africa',
'Mozambique': 'Africa',
'Namibia': 'Africa',
'Nigeria': 'Africa',
'Republic of the Congo': 'Africa',
'Reunion': 'Africa',
'Rwanda': 'Africa',
'Sierra Leone': 'Africa',
'South Africa': 'Africa',
'Swaziland': 'Africa', # Now officially Eswatini, but still mapped to Africa
'Tanzania': 'Africa',
'Togo': 'Africa',
'Uganda': 'Africa',
'Zambia': 'Africa',
'Zimbabwe': 'Africa'
}
country_data['region'] = country_data['Name'].apply(lambda x:country_region_map_full[x])
data["Birth country"] = data["Birth country"].replace({"United States":"USA"})
# Figure 1
data.region.value_counts(normalize=True)
country_data \
.groupby("region") \
.sum()['TotalChurchMembership'] \
.reset_index() \
.assign(prop = lambda x:x['TotalChurchMembership']/x['TotalChurchMembership'].sum())
# mexico quick look
country_data[country_data.Name == "Mexico"].TotalChurchMembership.iloc[0]/country_data.TotalChurchMembership.sum()
data['Birth country'].value_counts(normalize=True)
# Utah vs. US quick look
state_data[state_data.Name == "Utah"].TotalChurchMembership.iloc[0]/country_data[country_data.Name == "United States"].TotalChurchMembership.iloc[0]
# 32% of members are in Utah
data[data['Birth state'] == "Utah"].shape[0]/data[data['Birth country'] == "USA"].shape[0]
# 51% of leaders are from Utah
# Figure 2
# Career choice
st.tabs(ces_data.query("religpew == 3"), "industry", wts="commonweight", display="column")
ces_data['industry_rc'] = st.recode(ces_data, "industry", "10=NaN")
to_clip = st.tabs(ces_data.query("religpew == 3"), "industry_rc", wts="commonweight", display="column").to_list()
to_clip = list(map(str, to_clip))
pyperclip.copy('\n'.join(to_clip))
to_clip = st.tabs(data, "Profession category", display="column").to_list()
to_clip = list(map(str, to_clip))
pyperclip.copy('\n'.join(to_clip))
#figure 3
# education level
to_clip = st.tabs(ces_data.query("religpew == 3"), "educ", wts="commonweight", display="column").to_list()
to_clip = list(map(str, to_clip))
data['Bachelor Degree'].sum()/data.shape[0]
data['Graduate Degree'].sum()/data.shape[0]
pyperclip.copy('\n'.join(to_clip))
st.tabs(ces_data.query("religpew == 3"), "educ", wts="commonweight", display="column")
st.tabs(ces_data, "educ", wts="commonweight", display="column")
#figure 4
data[data.Gender == "Male"].shape[0]/data.shape[0]
# 94% Male
data[data.White == "Yes"].shape[0]/data.shape[0]
# 62% White
#figure 5
data[data.MBA].shape[0]/data.shape[0] #38%
data[data.JD].shape[0]/data.shape[0] #15%
data[data.PHD].shape[0]/data.shape[0] #7%
data[((data.MBA + data.JD + data.PHD)==0)&data['Graduate Degree']].shape[0]/data.shape[0]
#16%
#figure 6
data[data["Attended BYU"]].shape[0]/data.shape[0] #53%
data[data["Attended University of Utah"]].shape[0]/data.shape[0] #16%
#figure 7
#Q15? 8/15 53%
data[data["Ivy or Ivy Plus"]].shape[0]/data.shape[0] #18%
#there are 14 ivy or ivy plus schools
# Less than 1% of US college students attend ivy league schools and only ~34% have attended
#college. This means a fractiion of a percent of the US has attended ivy leagues
#not sure what the LDS number is, but it is likely quite small and perhaps similar and definitly not 18 or 53%.
#figure 8
tbl1 = data.region.value_counts(normalize=True).reset_index().rename(columns = {"index":"region", "region":"leadership_ratio"})
tbl2 = country_data \
.groupby("region") \
.sum()['TotalChurchMembership'] \
.reset_index() \
.assign(prop = lambda x:x['TotalChurchMembership']/x['TotalChurchMembership'].sum())
final = country_data \
.assign(MpC = lambda x:x['TotalChurchMembership']/x['Congregations']) \
.groupby("region") \
.mean()['MpC'] \
.reset_index() \
.merge(tbl1, on = "region") \
.merge(tbl2, on = "region") \
.assign(leadership_ratio = lambda x:x["leadership_ratio"] - x["prop"]) \
.assign(leadership_ratio = lambda x:round(x["leadership_ratio"]*100,2))
pyperclip.copy(final)
final['MpC'].corr(final['leadership_ratio'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment