Created
November 7, 2020 07:41
-
-
Save leehanchung/1cff2d66df6b77acb9354440ff793acf to your computer and use it in GitHub Desktop.
detecting election fraud using Benford's Law
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################## | |
# Author: Han-chung Lee | |
# Date: 2020-11-6 | |
# | |
# Detecting election fraud activity at Milwaukee County using Benford's Law | |
# | |
# Statistical detection of systematic election irregularities: | |
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3478593/ | |
############################################################################## | |
from collections import Counter | |
from math import log10 | |
from pprint import pprint | |
from typing import List, Dict, Tuple, Union | |
from bs4 import BeautifulSoup as bs | |
import matplotlib.axes as axes | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import requests | |
from scipy.stats import chisquare | |
url = "https://county.milwaukee.gov/EN/County-Clerk/Off-Nav/Election-Results/Election-Results-Fall-2020" | |
candidates = ['Biden', 'Trump', 'Blankenship', 'Jorgensen', 'Carroll'] | |
def get_soup(url:str): | |
"""get html data from input url and load it into beautifulsoup parser | |
Args: | |
url (str): source url | |
Returns: | |
beautifulsoup parser with loaded data | |
""" | |
headers = { | |
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0" | |
} | |
response = requests.get(url, headers=headers) | |
if response.status_code != requests.codes.ok: | |
print("ERROR") | |
return None | |
data = response.text | |
soup = bs(data, "html.parser") | |
return soup | |
def get_benfords() -> List[float]: | |
"""get list of expected discrete benford distribution values to be used | |
for chi square test | |
Returns: | |
List[float]: discrete benford's law distribution | |
""" | |
benfords = [log10(1 + 1/d) for d in range(1, 10)] | |
return benfords | |
def data_parsing(soup) -> Dict[str, List[str]]: | |
"""Take a beautiful soup object and parse it into a dict with | |
corresponding candidate names and lists of their vote counts for precincts | |
Args: | |
soup ([type]): [description] | |
Returns: | |
Dict[str, List[str]]: [description] | |
""" | |
precinct_table = soup.findAll('table', {'class': 'precinctTable'}) | |
federal_table = precinct_table[1] | |
data = { | |
"Num": [], | |
"Ward": [], | |
"Biden": [], | |
"Trump": [], | |
'Blankenship': [], | |
'Jorgensen': [], | |
'Carroll': [], | |
'writein': [] | |
} | |
count = 1 | |
header = True | |
for val in federal_table.findAll('td'): | |
if header and count != 8: | |
count += 1 | |
continue | |
elif header and count == 8: | |
count = 1 | |
header = False | |
continue | |
if count == 1: | |
data['Num'].append(val.text) | |
elif count == 2: | |
data['Ward'].append(val.text) | |
elif count == 3: | |
data['Biden'].append(val.text) | |
elif count == 4: | |
data['Trump'].append(val.text) | |
elif count == 5: | |
data['Blankenship'].append(val.text) | |
elif count == 6: | |
data['Jorgensen'].append(val.text) | |
elif count == 7: | |
data['Carroll'].append(val.text) | |
elif count == 8: | |
data['writein'].append(val.text) | |
count = 0 | |
count += 1 | |
# remove last data point, which is the total of all vote counts | |
data = {k: v[:-1] for k, v in data.items()} | |
return data | |
def preprocess( | |
data: Dict[str, List[str]] | |
) -> Tuple[Dict[str, List[int]], Dict[str, List[float]]]: | |
"""preprocess a dictionary of candidate name and votes and generate counts | |
of the leading digits and the expected values using Benford's Law | |
Args: | |
data (Dict[str, List[str]]): candidate: vote counts for all prescints | |
Returns: | |
Tuple[Dict[str, List[int]], Dict[str, List[float]]]: observed and | |
expected leading digit count for all candidates | |
""" | |
# get leading digit for precinct votes of all candidates | |
data = { | |
k: list(map(lambda x: int(str(x)[0]), v)) | |
for k, v in data.items() if k in candidates | |
} | |
# get counter of leading digits for all candidates | |
data = {k: Counter(v) for k, v in data.items()} | |
# sorted count of leading digits for all candidates, where index 0 | |
# correspond to counts of 1, index 2 correspond to counts of 2 | |
data_observed = { | |
k: list(dict(sorted(v.items(), key = lambda x: x[0])).values())[1:] | |
for k, v in data.items() | |
} | |
benfords = get_benfords() | |
data_expected = { | |
k: [sum(v) * b for b in benfords][:len(v)] | |
for k, v in data_observed.items() | |
} | |
data_observed['axis'] = [1, 2, 3, 4, 5, 6, 7, 8, 9] | |
data_expected['axis'] = [1, 2, 3, 4, 5, 6, 7, 8, 9] | |
return data_observed, data_expected | |
def visualize(expected: List[Union[int, float]], observed: List[Union[int, float]], candidate:str = None) -> axes._subplots: | |
if not candidate: | |
fig = sns.barplot(data=expected, x='axis', y='Biden', color='gray') | |
fig.set(xlabel='leading digits', ylabel='counts', title="Benford's Law distribution") | |
return fig | |
ax = sns.barplot(x='axis', | |
y=candidate, | |
data=expected, | |
color='gray', | |
saturation=0.5, | |
alpha=0.2, | |
label="expected") | |
ax2 = ax.twinx() | |
ax2.plot(ax.get_xticks(), observed[candidate], label='observed') | |
ax2.set_ylim(0, 150) | |
ax.set(xlabel='leading digits', ylabel='counts', title=f"{candidate} - Benford's Count - Observed vs Expected") | |
ax.legend(bbox_to_anchor=(1, 0.92)) | |
ax2.legend() | |
return ax | |
def main(): | |
soup = get_soup(url) | |
data = data_parsing(soup) | |
observed, expected = preprocess(data) | |
fig1 = visualize(expected=expected, observed=observed) | |
plt.show() | |
fig2 = visualize(candidate="Biden", expected=expected, observed=observed) | |
plt.show() | |
fig3 = visualize(candidate="Trump", expected=expected, observed=observed) | |
plt.show() | |
# H_o: vote count data fits Benford's Law distribution | |
# H_a: vote count data does not fit Benford's Law distribution | |
# | |
# all the pvalue are well under 0.05, thus not enough confidence to reject | |
# null hypothesis. | |
chi_square_results = { | |
candidate: chisquare(observed[candidate], expected[candidate]) | |
for candidate in candidates | |
} | |
pprint(chi_square_results) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment