Created
December 24, 2016 02:58
-
-
Save rachelanddata/e028515cf37916aaf799bacdf76d5ab3 to your computer and use it in GitHub Desktop.
Webscraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# import modules\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"url = 'http://www.paywizard.org/main/salary/minimum-wage'\n", | |
"\n", | |
"# Scrape the HTML at the url\n", | |
"r = requests.get(url)\n", | |
"\n", | |
"# Turn the HTML into a Beautiful Soup object\n", | |
"soup = BeautifulSoup(r.text, 'lxml')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create four variables to score the scraped data in\n", | |
"state_name = []\n", | |
"hourly_min_wage = []\n", | |
"notes = []\n", | |
"\n", | |
"# Create an object of the second object that is class=grid listing table\n", | |
"table = soup.findAll(class_='grid listing table')[1]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Find all the <tr> tag pairs, skip the first one, then for each.\n", | |
"for row in table.find_all('tr')[1:]:\n", | |
" # Create a variable of all the <td> tag pairs in each <tr> tag pair,\n", | |
" col = row.find_all('td')\n", | |
"\n", | |
" # Create a variable of the string inside 1st <td> tag pair,\n", | |
" column_1 = col[0].text.strip()\n", | |
" # and append it to first_name variable\n", | |
" state_name.append(column_1)\n", | |
"\n", | |
" # Create a variable of the string inside 2nd <td> tag pair,\n", | |
" column_2 = col[1].text.strip()\n", | |
" # and append it to last_name variable\n", | |
" hourly_min_wage.append(column_2)\n", | |
"\n", | |
" # Create a variable of the string inside 3rd <td> tag pair,\n", | |
" column_3 = col[2].text.strip()\n", | |
" # and append it to age variable\n", | |
" notes.append(column_3)\n", | |
"\n", | |
"\n", | |
"# Create a variable of the value of the columns\n", | |
"columns = {'state': state_name, 'hrly_min_wage': hourly_min_wage, 'notes': notes }\n", | |
"\n", | |
"# Create a dataframe from the columns variable\n", | |
"df = pd.DataFrame(columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>state</th>\n", | |
" <th>hrly_min_wage</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>ALABAMA</td>\n", | |
" <td>NONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>ALASKA</td>\n", | |
" <td>$9.75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>ARIZONA</td>\n", | |
" <td>$8.05</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>ARKANSAS</td>\n", | |
" <td>$8.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>CALIFORNIA</td>\n", | |
" <td>$10.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>COLORADO</td>\n", | |
" <td>$8.31</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>CONNECTICUT</td>\n", | |
" <td>$9.60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>DELAWARE</td>\n", | |
" <td>$8.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>DISTRICT OF COLUMBIA</td>\n", | |
" <td>$11.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>FLORIDA</td>\n", | |
" <td>$8.05</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>GEORGIA</td>\n", | |
" <td>$5.15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>HAWAII</td>\n", | |
" <td>$8.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>IDAHO</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>ILLINOIS</td>\n", | |
" <td>$8.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>INDIANA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>IOWA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>KANSAS</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>KENTUCKY</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>LOUISIANA</td>\n", | |
" <td>NONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>MAINE</td>\n", | |
" <td>$7.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>MARYLAND</td>\n", | |
" <td>$8.75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>MASSACHUSETTS</td>\n", | |
" <td>$10.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>MICHIGAN</td>\n", | |
" <td>$8.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>MINNESOTA</td>\n", | |
" <td>$9.50/7.75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>MISSISSIPPI</td>\n", | |
" <td>NONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>MISSOURI</td>\n", | |
" <td>$7.65</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>MONTANA</td>\n", | |
" <td>$8.05</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>NEBRASKA</td>\n", | |
" <td>$9.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>NEVADA</td>\n", | |
" <td>$8.25/7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>NEW HAMPSHIRE</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>30</th>\n", | |
" <td>NEW JERSEY</td>\n", | |
" <td>$8.38</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>31</th>\n", | |
" <td>NEW MEXICO</td>\n", | |
" <td>$7.50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>32</th>\n", | |
" <td>NEW YORK</td>\n", | |
" <td>$9.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>33</th>\n", | |
" <td>Building service janitors in residential build...</td>\n", | |
" <td>$6.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>34</th>\n", | |
" <td>All employees except janitors in residential b...</td>\n", | |
" <td>$9.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>35</th>\n", | |
" <td>N. CAROLINA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>36</th>\n", | |
" <td>N. DAKOTA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>37</th>\n", | |
" <td>OHIO</td>\n", | |
" <td>$8.10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>38</th>\n", | |
" <td>OKLAHOMA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>39</th>\n", | |
" <td>OREGON</td>\n", | |
" <td>$9.50/9.75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>40</th>\n", | |
" <td>PENNSYLVANIA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>41</th>\n", | |
" <td>RHODE ISLAND</td>\n", | |
" <td>$9.60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>42</th>\n", | |
" <td>S. CAROLINA</td>\n", | |
" <td>NONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>43</th>\n", | |
" <td>S. DAKOTA</td>\n", | |
" <td>$8.55</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>44</th>\n", | |
" <td>TENNESSEE</td>\n", | |
" <td>NONE</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>45</th>\n", | |
" <td>TEXAS</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>46</th>\n", | |
" <td>UTAH</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>47</th>\n", | |
" <td>VERMONT</td>\n", | |
" <td>$9.60</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>48</th>\n", | |
" <td>VIRGINIA</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>49</th>\n", | |
" <td>WASHINGTON</td>\n", | |
" <td>$9.47</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50</th>\n", | |
" <td>WEST VIRGINIA</td>\n", | |
" <td>$8.75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>51</th>\n", | |
" <td>WISCONSIN</td>\n", | |
" <td>$7.25</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>52</th>\n", | |
" <td>WYOMING</td>\n", | |
" <td>$5.15</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" state hrly_min_wage\n", | |
"0 ALABAMA NONE\n", | |
"1 ALASKA $9.75\n", | |
"2 ARIZONA $8.05\n", | |
"3 ARKANSAS $8.00\n", | |
"4 CALIFORNIA $10.00\n", | |
"5 COLORADO $8.31\n", | |
"6 CONNECTICUT $9.60\n", | |
"7 DELAWARE $8.25\n", | |
"8 DISTRICT OF COLUMBIA $11.50\n", | |
"9 FLORIDA $8.05\n", | |
"10 GEORGIA $5.15\n", | |
"11 HAWAII $8.50\n", | |
"12 IDAHO $7.25\n", | |
"13 ILLINOIS $8.25\n", | |
"14 INDIANA $7.25\n", | |
"15 IOWA $7.25\n", | |
"16 KANSAS $7.25\n", | |
"17 KENTUCKY $7.25\n", | |
"18 LOUISIANA NONE\n", | |
"19 MAINE $7.50\n", | |
"20 MARYLAND $8.75\n", | |
"21 MASSACHUSETTS $10.00\n", | |
"22 MICHIGAN $8.50\n", | |
"23 MINNESOTA $9.50/7.75\n", | |
"24 MISSISSIPPI NONE\n", | |
"25 MISSOURI $7.65\n", | |
"26 MONTANA $8.05\n", | |
"27 NEBRASKA $9.00\n", | |
"28 NEVADA $8.25/7.25\n", | |
"29 NEW HAMPSHIRE $7.25\n", | |
"30 NEW JERSEY $8.38\n", | |
"31 NEW MEXICO $7.50\n", | |
"32 NEW YORK $9.00\n", | |
"33 Building service janitors in residential build... $6.00\n", | |
"34 All employees except janitors in residential b... $9.00\n", | |
"35 N. CAROLINA $7.25\n", | |
"36 N. DAKOTA $7.25\n", | |
"37 OHIO $8.10\n", | |
"38 OKLAHOMA $7.25\n", | |
"39 OREGON $9.50/9.75\n", | |
"40 PENNSYLVANIA $7.25\n", | |
"41 RHODE ISLAND $9.60\n", | |
"42 S. CAROLINA NONE\n", | |
"43 S. DAKOTA $8.55\n", | |
"44 TENNESSEE NONE\n", | |
"45 TEXAS $7.25\n", | |
"46 UTAH $7.25\n", | |
"47 VERMONT $9.60\n", | |
"48 VIRGINIA $7.25\n", | |
"49 WASHINGTON $9.47\n", | |
"50 WEST VIRGINIA $8.75\n", | |
"51 WISCONSIN $7.25\n", | |
"52 WYOMING $5.15" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[['state','hrly_min_wage']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df.drop('notes',axis=1,inplace=True) # drop the notes column" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# write the dataframe to a table, drop the index column\n", | |
"df.to_csv('min_wage_per_state.csv', index=False) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [Root]", | |
"language": "python", | |
"name": "Python [Root]" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment