Skip to content

Instantly share code, notes, and snippets.

@rachelanddata
Created December 24, 2016 02:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rachelanddata/e028515cf37916aaf799bacdf76d5ab3 to your computer and use it in GitHub Desktop.
Save rachelanddata/e028515cf37916aaf799bacdf76d5ab3 to your computer and use it in GitHub Desktop.
Webscraper
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# import modules\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"url = 'http://www.paywizard.org/main/salary/minimum-wage'\n",
"\n",
"# Scrape the HTML at the url\n",
"r = requests.get(url)\n",
"\n",
"# Turn the HTML into a Beautiful Soup object\n",
"soup = BeautifulSoup(r.text, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Create four variables to score the scraped data in\n",
"state_name = []\n",
"hourly_min_wage = []\n",
"notes = []\n",
"\n",
"# Create an object of the second object that is class=grid listing table\n",
"table = soup.findAll(class_='grid listing table')[1]\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Find all the <tr> tag pairs, skip the first one, then for each.\n",
"for row in table.find_all('tr')[1:]:\n",
" # Create a variable of all the <td> tag pairs in each <tr> tag pair,\n",
" col = row.find_all('td')\n",
"\n",
" # Create a variable of the string inside 1st <td> tag pair,\n",
" column_1 = col[0].text.strip()\n",
" # and append it to first_name variable\n",
" state_name.append(column_1)\n",
"\n",
" # Create a variable of the string inside 2nd <td> tag pair,\n",
" column_2 = col[1].text.strip()\n",
" # and append it to last_name variable\n",
" hourly_min_wage.append(column_2)\n",
"\n",
" # Create a variable of the string inside 3rd <td> tag pair,\n",
" column_3 = col[2].text.strip()\n",
" # and append it to age variable\n",
" notes.append(column_3)\n",
"\n",
"\n",
"# Create a variable of the value of the columns\n",
"columns = {'state': state_name, 'hrly_min_wage': hourly_min_wage, 'notes': notes }\n",
"\n",
"# Create a dataframe from the columns variable\n",
"df = pd.DataFrame(columns)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>hrly_min_wage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ALABAMA</td>\n",
" <td>NONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ALASKA</td>\n",
" <td>$9.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ARIZONA</td>\n",
" <td>$8.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ARKANSAS</td>\n",
" <td>$8.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CALIFORNIA</td>\n",
" <td>$10.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>COLORADO</td>\n",
" <td>$8.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>CONNECTICUT</td>\n",
" <td>$9.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>DELAWARE</td>\n",
" <td>$8.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>DISTRICT OF COLUMBIA</td>\n",
" <td>$11.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>FLORIDA</td>\n",
" <td>$8.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>GEORGIA</td>\n",
" <td>$5.15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>HAWAII</td>\n",
" <td>$8.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>IDAHO</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>ILLINOIS</td>\n",
" <td>$8.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>INDIANA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>IOWA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>KANSAS</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>KENTUCKY</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>LOUISIANA</td>\n",
" <td>NONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>MAINE</td>\n",
" <td>$7.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>MARYLAND</td>\n",
" <td>$8.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>MASSACHUSETTS</td>\n",
" <td>$10.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>MICHIGAN</td>\n",
" <td>$8.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>MINNESOTA</td>\n",
" <td>$9.50/7.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>MISSISSIPPI</td>\n",
" <td>NONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>MISSOURI</td>\n",
" <td>$7.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>MONTANA</td>\n",
" <td>$8.05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>NEBRASKA</td>\n",
" <td>$9.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>NEVADA</td>\n",
" <td>$8.25/7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>NEW HAMPSHIRE</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>NEW JERSEY</td>\n",
" <td>$8.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>NEW MEXICO</td>\n",
" <td>$7.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>NEW YORK</td>\n",
" <td>$9.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Building service janitors in residential build...</td>\n",
" <td>$6.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>All employees except janitors in residential b...</td>\n",
" <td>$9.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>N. CAROLINA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>N. DAKOTA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>OHIO</td>\n",
" <td>$8.10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>OKLAHOMA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>OREGON</td>\n",
" <td>$9.50/9.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>PENNSYLVANIA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>RHODE ISLAND</td>\n",
" <td>$9.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>S. CAROLINA</td>\n",
" <td>NONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>S. DAKOTA</td>\n",
" <td>$8.55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>TENNESSEE</td>\n",
" <td>NONE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>TEXAS</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>UTAH</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>VERMONT</td>\n",
" <td>$9.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>VIRGINIA</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>WASHINGTON</td>\n",
" <td>$9.47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>WEST VIRGINIA</td>\n",
" <td>$8.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>WISCONSIN</td>\n",
" <td>$7.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>WYOMING</td>\n",
" <td>$5.15</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state hrly_min_wage\n",
"0 ALABAMA NONE\n",
"1 ALASKA $9.75\n",
"2 ARIZONA $8.05\n",
"3 ARKANSAS $8.00\n",
"4 CALIFORNIA $10.00\n",
"5 COLORADO $8.31\n",
"6 CONNECTICUT $9.60\n",
"7 DELAWARE $8.25\n",
"8 DISTRICT OF COLUMBIA $11.50\n",
"9 FLORIDA $8.05\n",
"10 GEORGIA $5.15\n",
"11 HAWAII $8.50\n",
"12 IDAHO $7.25\n",
"13 ILLINOIS $8.25\n",
"14 INDIANA $7.25\n",
"15 IOWA $7.25\n",
"16 KANSAS $7.25\n",
"17 KENTUCKY $7.25\n",
"18 LOUISIANA NONE\n",
"19 MAINE $7.50\n",
"20 MARYLAND $8.75\n",
"21 MASSACHUSETTS $10.00\n",
"22 MICHIGAN $8.50\n",
"23 MINNESOTA $9.50/7.75\n",
"24 MISSISSIPPI NONE\n",
"25 MISSOURI $7.65\n",
"26 MONTANA $8.05\n",
"27 NEBRASKA $9.00\n",
"28 NEVADA $8.25/7.25\n",
"29 NEW HAMPSHIRE $7.25\n",
"30 NEW JERSEY $8.38\n",
"31 NEW MEXICO $7.50\n",
"32 NEW YORK $9.00\n",
"33 Building service janitors in residential build... $6.00\n",
"34 All employees except janitors in residential b... $9.00\n",
"35 N. CAROLINA $7.25\n",
"36 N. DAKOTA $7.25\n",
"37 OHIO $8.10\n",
"38 OKLAHOMA $7.25\n",
"39 OREGON $9.50/9.75\n",
"40 PENNSYLVANIA $7.25\n",
"41 RHODE ISLAND $9.60\n",
"42 S. CAROLINA NONE\n",
"43 S. DAKOTA $8.55\n",
"44 TENNESSEE NONE\n",
"45 TEXAS $7.25\n",
"46 UTAH $7.25\n",
"47 VERMONT $9.60\n",
"48 VIRGINIA $7.25\n",
"49 WASHINGTON $9.47\n",
"50 WEST VIRGINIA $8.75\n",
"51 WISCONSIN $7.25\n",
"52 WYOMING $5.15"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[['state','hrly_min_wage']]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.drop('notes',axis=1,inplace=True) # drop the notes column"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# write the dataframe to a table, drop the index column\n",
"df.to_csv('min_wage_per_state.csv', index=False) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment