Created
April 10, 2024 16:03
-
-
Save DiogenesAnalytics/33d62b946bd163fadc837b3ede9e0a9f to your computer and use it in GitHub Desktop.
Scraping US state population data from Wikipedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Rank in states & territories 2020 | Rank in states & territories 2010 | State | Census population April 1 2020 | Census population April 1 2010 | Percent change 2010–2020 | Absolute change 2010-2020 | Total seats in the U.S. House of Representatives 2023–2033 | Census population per electoral vote | Census population per House seat | Percent of the total U.S. population 2020 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | California | 39538223 | 37253956 | 6.1 | 2284267 | 52 | 732189 | 760350 | 11.80 | |
2 | 2 | Texas | 29145505 | 25145561 | 15.9 | 3999944 | 38 | 728638 | 766987 | 8.70 | |
3 | 4 | Florida | 21538187 | 18801310 | 14.6 | 2736877 | 28 | 717940 | 769221 | 6.43 | |
4 | 3 | New York | 20201249 | 19378102 | 4.2 | 823147 | 26 | 721473 | 776971 | 6.03 | |
5 | 6 | Pennsylvania | 13002700 | 12702379 | 2.4 | 300321 | 17 | 684353 | 764865 | 3.88 | |
6 | 5 | Illinois | 12812508 | 12830632 | −0.1 | −18124 | 17 | 674343 | 753677 | 3.82 | |
7 | 7 | Ohio | 11799448 | 11536504 | 2.3 | 262944 | 15 | 694085 | 786630 | 3.52 | |
8 | 9 | Georgia | 10711908 | 9687653 | 10.6 | 1024255 | 14 | 669494 | 765136 | 3.20 | |
9 | 10 | North Carolina | 10439388 | 9535483 | 9.5 | 903905 | 14 | 652462 | 745671 | 3.12 | |
10 | 8 | Michigan | 10077331 | 9883640 | 2.0 | 193691 | 13 | 671822 | 775179 | 3.01 | |
11 | 11 | New Jersey | 9288994 | 8791894 | 5.7 | 497100 | 12 | 663500 | 774083 | 2.77 | |
12 | 12 | Virginia | 8631393 | 8001024 | 7.9 | 630369 | 11 | 663953 | 784672 | 2.58 | |
13 | 13 | Washington | 7705281 | 6724540 | 14.6 | 980741 | 10 | 642107 | 770528 | 2.30 | |
14 | 16 | Arizona | 7151502 | 6392017 | 11.9 | 759485 | 9 | 650137 | 794611 | 2.13 | |
15 | 14 | Massachusetts | 7029917 | 6547629 | 7.4 | 482288 | 9 | 639083 | 781102 | 2.10 | |
16 | 17 | Tennessee | 6910840 | 6346105 | 8.9 | 564735 | 9 | 628258 | 767871 | 2.06 | |
17 | 15 | Indiana | 6785528 | 6483802 | 4.7 | 301726 | 9 | 616866 | 753948 | 2.03 | |
18 | 19 | Maryland | 6177224 | 5773552 | 7.0 | 403672 | 8 | 617722 | 772153 | 1.84 | |
19 | 18 | Missouri | 6154913 | 5988927 | 2.8 | 165986 | 8 | 615491 | 769364 | 1.84 | |
20 | 20 | Wisconsin | 5893718 | 5686986 | 3.6 | 206732 | 8 | 589372 | 736715 | 1.76 | |
21 | 22 | Colorado | 5773714 | 5029196 | 14.8 | 744518 | 8 | 577371 | 721714 | 1.72 | |
22 | 21 | Minnesota | 5706494 | 5303925 | 7.6 | 402569 | 8 | 570649 | 713312 | 1.70 | |
23 | 24 | South Carolina | 5118425 | 4625364 | 10.7 | 493061 | 7 | 568714 | 731204 | 1.53 | |
24 | 23 | Alabama | 5024279 | 4779736 | 5.1 | 244543 | 7 | 558253 | 717754 | 1.50 | |
25 | 25 | Louisiana | 4657757 | 4533372 | 2.7 | 124385 | 6 | 582220 | 776293 | 1.39 | |
26 | 26 | Kentucky | 4505836 | 4339367 | 3.8 | 166469 | 6 | 563230 | 750973 | 1.35 | |
27 | 27 | Oregon | 4237256 | 3831074 | 10.6 | 406182 | 6 | 529657 | 706209 | 1.27 | |
28 | 28 | Oklahoma | 3959353 | 3751351 | 5.5 | 208002 | 5 | 565622 | 791871 | 1.18 | |
29 | 30 | Connecticut | 3605944 | 3574097 | 0.9 | 31847 | 5 | 515135 | 721189 | 1.08 | |
30 | 29 | Puerto Rico | 3285874 | 3725789 | −11.8 | −439915 | 1 | NaN | NaN | 0.98 | |
31 | 35 | Utah | 3271616 | 2763885 | 18.4 | 507731 | 4 | 545269 | 817904 | 0.98 | |
32 | 31 | Iowa | 3190369 | 3046355 | 4.7 | 144014 | 4 | 531728 | 797592 | 0.95 | |
33 | 36 | Nevada | 3104614 | 2700551 | 15.0 | 404063 | 4 | 517436 | 776154 | 0.93 | |
34 | 33 | Arkansas | 3011524 | 2915918 | 3.3 | 95606 | 4 | 501921 | 752881 | 0.90 | |
35 | 32 | Mississippi | 2961279 | 2967297 | −0.2 | −6018 | 4 | 493547 | 740320 | 0.88 | |
36 | 34 | Kansas | 2937880 | 2853118 | 3.0 | 84762 | 4 | 489647 | 734470 | 0.88 | |
37 | 37 | New Mexico | 2117522 | 2059179 | 2.8 | 58343 | 3 | 423504 | 705841 | 0.63 | |
38 | 39 | Nebraska | 1961504 | 1826341 | 7.4 | 135163 | 3 | 392301 | 653835 | 0.59 | |
39 | 40 | Idaho | 1839106 | 1567582 | 17.3 | 271524 | 2 | 459777 | 919553 | 0.55 | |
40 | 38 | West Virginia | 1793716 | 1852994 | −3.2 | −59278 | 2 | 448429 | 896858 | 0.54 | |
41 | 41 | Hawaii | 1455271 | 1360301 | 7.0 | 94970 | 2 | 363818 | 727636 | 0.43 | |
42 | 43 | New Hampshire | 1377529 | 1316470 | 4.6 | 61059 | 2 | 344382 | 688765 | 0.41 | |
43 | 42 | Maine | 1362359 | 1328361 | 2.6 | 33998 | 2 | 340590 | 681180 | 0.41 | |
44 | 44 | Rhode Island | 1097379 | 1052567 | 4.3 | 44812 | 2 | 274345 | 548690 | 0.33 | |
45 | 45 | Montana | 1084225 | 989415 | 9.6 | 94810 | 2 | 271056 | 542113 | 0.32 | |
46 | 46 | Delaware | 989948 | 897934 | 10.2 | 92014 | 1 | 329983 | 989948 | 0.30 | |
47 | 47 | South Dakota | 886667 | 814180 | 8.9 | 72487 | 1 | 295556 | 886667 | 0.27 | |
48 | 49 | North Dakota | 779094 | 672591 | 15.8 | 106503 | 1 | 259698 | 779094 | 0.23 | |
49 | 48 | Alaska | 733391 | 710231 | 3.3 | 23160 | 1 | 244464 | 733391 | 0.22 | |
50 | 51 | District of Columbia | 689545 | 601723 | 14.6 | 87822 | 1 | 229848 | NaN | 0.21 | |
51 | 50 | Vermont | 643077 | 625741 | 2.8 | 17336 | 1 | 214359 | 643077 | 0.19 | |
52 | 52 | Wyoming | 576851 | 563626 | 2.3 | 13225 | 1 | 192284 | 576851 | 0.17 | |
53 | 53 | Guam | 153836 | 159358 | −3.5 | −5522 | 1 | NaN | NaN | 0.05 | |
54 | 54 | U.S. Virgin Islands | 87146 | 106405 | −18.1 | −19259 | 1 | NaN | NaN | 0.03 | |
55 | 55 | American Samoa | 49710 | 55519 | −10.5 | −5809 | 1 | NaN | NaN | 0.01 | |
56 | 56 | Northern Mariana Islands | 47329 | 53883 | −12.2 | −6554 | 1 | NaN | NaN | 0.01 | |
NaN | NaN | Contiguous United States | 329260619 | 306675006 | 7.4 | 22585613 | 432 | 627163 | 757745 | 98.27 | |
NaN | NaN | The fifty states | 330759736 | 308143815 | 7.3 | 22615921 | 435 | 621729 | 755796 | 98.71 | |
NaN | NaN | Fifty states D.C. | 331449281 | 308745538 | 7.4 | 22703743 | 435 | 619531 | NaN | 98.92 | |
NaN | NaN | Total U.S. (including D.C. and territories) | 335073176 | 312846492 | 7.1 | 22226684 | 435 | NaN | NaN | 100.00 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zbpNWGif3una" | |
}, | |
"source": [ | |
"# Scraping Wikipedia U.S. States by Population Data\n", | |
"This notebook simply pulls the [*U.S. States Population Data*](https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population) data down from *Wikipedia* and saves it to a *CSV* file." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "nLqshbIu4ZUx" | |
}, | |
"source": [ | |
"## Setup\n", | |
"First we need to the *scraping libraries* ..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "950N0MZz1BXK" | |
}, | |
"outputs": [], | |
"source": [ | |
"# get necessary scraping tools\n", | |
"!pip install beautifulsoup4 requests" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rXa1NZY54i88" | |
}, | |
"source": [ | |
"## Scraping\n", | |
"Now we can scrape the *Wikipedia* page and save the data into the *CSV* file `country_hdi.csv`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "MIqrdHP31JOQ" | |
}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import csv\n", | |
"\n", | |
"def remove_citations(text):\n", | |
" \"\"\"Removes in lingering Wikipedia citation syntax.\"\"\"\n", | |
" # regex for grabbing citations\n", | |
" citation_pattern = r'\\[\\w\\]|\\[\\d+\\]|\\[note\\s*\\d+\\]'\n", | |
" \n", | |
" # use re.sub() to remove citations from the text\n", | |
" text_without_citations = re.sub(citation_pattern, '', text)\n", | |
"\n", | |
" # get cleaned text\n", | |
" return text_without_citations\n", | |
"\n", | |
"def sanitize_csv(text):\n", | |
" \"\"\"All necessary sanitization steps for storing in CSV.\"\"\"\n", | |
" # check cases\n", | |
" match text.strip():\n", | |
" case \"—\":\n", | |
" # check for NaN\n", | |
" sanitized = re.sub(r\"—\", \"NaN\", text)\n", | |
" \n", | |
" case _ if \"non-voting\" in text:\n", | |
" # remove non voting string\n", | |
" sanitized = re.sub(r\"\\([^()]+\\)\", \"\", text)\n", | |
" \n", | |
" case _:\n", | |
" # firt remove citations\n", | |
" cite_free_text = remove_citations(text)\n", | |
" \n", | |
" # now remove unwanted characters\n", | |
" sanitized = re.sub(r'[,\\n\\xa0%+]', '', cite_free_text)\n", | |
"\n", | |
" # cleaned\n", | |
" return sanitized.strip()\n", | |
"\n", | |
"# URL of the Wikipedia page containing the table\n", | |
"url = \"https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population\"\n", | |
"\n", | |
"# send a GET request to the URL\n", | |
"response = requests.get(url)\n", | |
"\n", | |
"# parse the HTML content\n", | |
"soup = BeautifulSoup(response.text, 'html.parser')\n", | |
"\n", | |
"# find the table containing the data\n", | |
"table = soup.find('table', class_='wikitable')\n", | |
"\n", | |
"# initialize a list to store the table data\n", | |
"data = []\n", | |
"\n", | |
"# iterate over the rows of the table\n", | |
"for row in table.find_all('tr'):\n", | |
" # initialize a list to store the data for each row\n", | |
" row_data = []\n", | |
" \n", | |
" # iterate over the cells (columns) in the row\n", | |
" for cell in row.find_all(['td', 'th']):\n", | |
" # append the cell content to the row data list\n", | |
" row_data.append(sanitize_csv(cell.get_text()))\n", | |
"\n", | |
" # check if the row contains any non-empty cells\n", | |
" if any(row_data): \n", | |
" # append the row data to the main data list\n", | |
" data.append(row_data)\n", | |
"\n", | |
"# define the path to save the CSV file\n", | |
"csv_file = \"pop_us_state_2024.csv\"\n", | |
"\n", | |
"# write the data to a CSV file\n", | |
"with open(csv_file, 'w', newline='') as file:\n", | |
" writer = csv.writer(file)\n", | |
" writer.writerows(data)\n", | |
"\n", | |
"# notify when done\n", | |
"print(f\"Data has been scraped and saved to: {csv_file}\")" | |
] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment