Skip to content

Instantly share code, notes, and snippets.

@TrevorMcCormick
Last active May 4, 2020 03:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TrevorMcCormick/3e08890b8a74846b55bc847c54cda38c to your computer and use it in GitHub Desktop.
Save TrevorMcCormick/3e08890b8a74846b55bc847c54cda38c to your computer and use it in GitHub Desktop.
Cosmetology Scrape
#Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Column order of the output CSV.
COLUMNS = ['License Type', 'Name', 'Main Address*', 'Mailing Address',
           'Name Type', 'License Number', 'Rank', 'Status', 'Expires']

SEARCH_URL = "https://www.myfloridalicense.com/wl11.asp?mode=3&search=&SID=&brd=&typ="
TOTAL_PAGES = 103          # number of result pages requested by the loop
RECORDS_PER_PAGE = 50      # upper bound on records read from one page
CELLS_PER_RECORD = 5       # license type, name, name type, number/rank, status/expires
REQUEST_TIMEOUT = 30       # seconds; avoids hanging forever on a stalled request
OUTPUT_PATH = 'cosmetologists.csv'


def _build_form_data(page):
    """Return the POST form fields that request result page *page*."""
    return {
        "hSearchOpt": "Organization",
        "hSearchAltName": "Alt",
        "hDivision": "ALL",
        # Cosmetologists
        "hBoard": "05",
        "hLicenseType": "0501",
        # Manatee county
        "hCounty": "51",
        "hState": "FL",
        "hCurrPage": "1",
        "hTotalPages": str(TOTAL_PAGES),
        "hTotalRecords": "5145",
        "hPageAction": "1",
        # Max results
        "hRecsPerPage": str(RECORDS_PER_PAGE),
        # Page number being requested
        "Page": str(page),
        "SearchGo.x": "0",
        "SearchGo.y": "0",
    }


def _cell_text_or_none(cells, index):
    """Return the stripped text of ``cells[index]``, or None if it is absent."""
    try:
        return cells[index].text.strip()
    except IndexError:
        return None


def _split_cell(cell):
    """Split a cell's comma-joined text into ``(first, second)``.

    ``second`` is None when the cell contains only one text fragment.
    """
    parts = cell.get_text(separator=",").split(',')
    second = parts[1] if len(parts) > 1 else None
    return parts[0], second


def _parse_page(html):
    """Extract the license records from one page of search results.

    Returns a list of dicts keyed by ``COLUMNS``. Returns an empty list when
    the results table (or its data row) is not present in *html*.
    """
    doc = BeautifulSoup(html, "html.parser")
    table = doc.find("table", attrs={'bgcolor': '#b6c9dc'})
    if table is None:
        return []

    # Skip the first row (header) of the results table.
    rows = table.find_all("tr")[1:]
    if not rows:
        return []

    # Every cell is looked up beneath the first remaining row, exactly as the
    # original script did (presumably html.parser nests the rest of the table
    # under it -- verify against the live markup).
    container = rows[0]
    # Record cells span one column; address cells span six.
    record_cells = container.find_all("td", attrs={"colspan": "1"})
    address_cells = container.find_all(
        "td", attrs={"align": "left", "colspan": "6"})

    # Stop at whichever runs out first so a short final page ends cleanly
    # instead of raising IndexError.
    record_count = min(RECORDS_PER_PAGE,
                       len(record_cells) // CELLS_PER_RECORD,
                       len(address_cells))

    records = []
    for n in range(record_count):
        start = n * CELLS_PER_RECORD
        (license_type, name, name_type,
         number_cell, status_cell) = record_cells[start:start + CELLS_PER_RECORD]

        # The n-th address cell belongs to the n-th record; the values are
        # read from its 2nd and 4th nested <td>.
        address_parts = address_cells[n].find_all("td")

        license_number, rank = _split_cell(number_cell)
        status, expires = _split_cell(status_cell)

        records.append({
            'License Type': license_type.text.strip(),
            'Name': name.text.strip(),
            'Main Address*': _cell_text_or_none(address_parts, 1),
            "Mailing Address": _cell_text_or_none(address_parts, 3),
            'Name Type': name_type.text.strip(),
            "License Number": license_number,
            "Rank": rank,
            "Status": status,
            "Expires": expires,
        })
    return records


def _scrape_all_pages():
    """Fetch every result page and return all records as one DataFrame.

    Raises ``requests.HTTPError`` if the server answers with an error status.
    """
    all_records = []
    for page in range(1, TOTAL_PAGES + 1):
        response = requests.post(SEARCH_URL, data=_build_form_data(page),
                                 timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        page_records = _parse_page(response.content)
        if not page_records:
            print('No records found on page: {0}'.format(page))
        all_records.extend(page_records)
        print('Completed page: {0}'.format(page))

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(all_records, columns=COLUMNS)


if __name__ == "__main__":
    df = _scrape_all_pages()
    # Write df to csv
    df.to_csv(OUTPUT_PATH, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment