@gitdagray
Last active June 10, 2020 18:22
Full Python code for web scraping Kanview using Selenium, Beautiful Soup, and pandas
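The script assumes Firefox plus a geckodriver binary on the PATH, along with a handful of packages (lxml is used here as the parser for Beautiful Soup and by pandas.read_html). Assuming a standard Python setup, they can be installed with:

pip install selenium beautifulsoup4 pandas tabulate lxml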
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
from tabulate import tabulate
import os
#launch url
url = "http://kanview.ks.gov/PayRates/PayRates_Agency.aspx"
# create a new Firefox session
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
#After opening the url above, Selenium clicks the specific agency link
python_button = driver.find_element_by_id('MainContent_uxLevel1_Agencies_uxAgencyBtn_33') #FHSU
python_button.click() #click fhsu link
#Selenium hands the page source to Beautiful Soup
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')
datalist = [] #empty list
x = 0 #counter
#Beautiful Soup finds all Job Title links on the agency page and the loop begins
for link in soup_level1.find_all('a', id=re.compile("^MainContent_uxLevel2_JobTitles_uxJobTitleBtn_")):
    #Selenium visits each Job Title page
    python_button = driver.find_element_by_id('MainContent_uxLevel2_JobTitles_uxJobTitleBtn_' + str(x))
    python_button.click() #click link
    #Selenium hands off the source of the specific job page to Beautiful Soup
    soup_level2 = BeautifulSoup(driver.page_source, 'lxml')
    #Beautiful Soup grabs the first HTML table on the page
    table = soup_level2.find_all('table')[0]
    #Giving the HTML table to pandas to put in a dataframe object
    df = pd.read_html(str(table), header=0)
    #Store the dataframe in the list
    datalist.append(df[0])
    #Ask Selenium to navigate back to the agency page
    driver.execute_script("window.history.go(-1)")
    #increment the counter variable before starting the loop over
    x += 1
#loop has completed
#end the Selenium browser session
driver.quit()
#combine all pandas dataframes in the list into one big dataframe
result = pd.concat(datalist, ignore_index=True)
#convert the pandas dataframe to JSON
json_records = result.to_json(orient='records')
#pretty print to CLI with tabulate
#converts to an ascii table
print(tabulate(result, headers=["Employee Name","Job Title","Overtime Pay","Total Gross Pay"],tablefmt='psql'))
#get current working directory
path = os.getcwd()
#open the file, write the JSON records, and close it automatically
with open(os.path.join(path, "fhsu_payroll_data.json"), "w") as f: #FHSU
    f.write(json_records)
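Note: the gist targets the Selenium 3 API. The find_element_by_id helpers were removed in Selenium 4, so under a current install the lookups would need the By locator instead; a sketch of the equivalent call for the first button:

from selenium.webdriver.common.by import By
python_button = driver.find_element(By.ID, 'MainContent_uxLevel1_Agencies_uxAgencyBtn_33') #FHSU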
root-94 commented Sep 14, 2018

How could this be modified to save as CSV instead of JSON?
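Since result is already a pandas dataframe, one minimal way (a sketch, not part of the original gist) is to replace the JSON export at the end of the script with DataFrame.to_csv:

#write the combined dataframe to CSV instead of JSON
result.to_csv(os.path.join(path, "fhsu_payroll_data.csv"), index=False)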
