# TrevorMcCormick/cosmetology_scrape.py

## cosmetology_scrape.py
#Import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the cosmetologist license search results from myfloridalicense.com
# (board 05, license type 0501, county 51) and write them to cosmetologists.csv.

# Column order of the output CSV; one row is written per licensee.
columns = ['License Type', 'Name', 'Main Address*', 'Mailing Address',
           'Name Type', 'License Number', 'Rank', 'Status', 'Expires']

# Scraped rows are collected in a plain list and converted to a DataFrame once
# at the end: DataFrame.append() was removed in pandas 2.0, and it copied the
# whole frame on every call.
records = []

# The search endpoint is the same for every page, so define it once.
url = "https://www.myfloridalicense.com/wl11.asp?mode=3&search=&SID=&brd=&typ="

#For loop through 103 pages
for page in range(1, 104):
    #Search form fields posted for this page of results
    data = {
        "hSearchOpt":"Organization",
        "hSearchAltName":"Alt",
        "hDivision":"ALL",
        #Cosmetologists
        "hBoard":"05",
        "hLicenseType":"0501",
        #Manatee county
        "hCounty":"51",
        "hState":"FL",
        "hCurrPage":"1",
        "hTotalPages":"103",
        "hTotalRecords":"5145",
        "hPageAction":"1",
        #Max results
        "hRecsPerPage":"50",
        #Loop through page number
        "Page":str(page),
        "SearchGo.x":"0",
        "SearchGo.y":"0"
    }
    response = requests.post(url, data=data)
    doc = BeautifulSoup(response.content, "html.parser")

    # Grab all the cosmetologists from table
    table = doc.find("table", attrs={'bgcolor':'#b6c9dc'})
    rows = table.find_all("tr")[1:] if table is not None else []
    if not rows:
        # Without the results table there is nothing to parse on this page.
        # Skip it instead of crashing so the pages already scraped still
        # reach the CSV.
        print('Skipping page {0}: no results table (HTTP {1})'.format(
            page, response.status_code))
        continue

    # Cosmetologist cells span one column, addresses span six.
    # Both lists are the same for every record on the page, so look them up
    # once rather than once per record.
    licensee_cells = rows[0].find_all("td", attrs={"colspan":"1"})
    address_cells = rows[0].find_all("td", attrs={"align":"left", "colspan":"6"})

    # Cosmetologists have five cells, addresses have one
    for n in range(0, 50):
        # Grab the five cells belonging to the n-th licensee
        start = n * 5
        cosmetologist = licensee_cells[start:start + 5]
        if len(cosmetologist) < 5:
            # Slicing past the end returns a short list instead of raising,
            # so the end of the page has to be detected explicitly.
            print('done')
            break

        # The n-th address cell belongs to the n-th licensee; index 1 of its
        # nested cells is the main address and index 3 the mailing address.
        address_parts = address_cells[n].find_all("td") if n < len(address_cells) else []
        main = address_parts[1].text.strip() if len(address_parts) > 1 else None
        mailing = address_parts[3].text.strip() if len(address_parts) > 3 else None

        # Cell 3 holds "license number[,rank]"; cell 4 holds "status[,expires]".
        # The second part of each is optional.
        license_parts = cosmetologist[3].get_text(separator=",").split(',')
        status_parts = cosmetologist[4].get_text(separator=",").split(',')
        rank = license_parts[1] if len(license_parts) > 1 else None
        expires = status_parts[1] if len(status_parts) > 1 else None

        # One dictionary per licensee, keyed by output column
        records.append({
            'License Type': cosmetologist[0].text.strip(),
            'Name': cosmetologist[1].text.strip(),
            'Main Address*': main,
            "Mailing Address": mailing,
            'Name Type': cosmetologist[2].text.strip(),
            "License Number": license_parts[0],
            "Rank": rank,
            "Status": status_parts[0],
            "Expires": expires
        })
    print('Completed page: {0}'.format(page))

# Build the frame once from all collected rows, then write df to csv
df = pd.DataFrame(records, columns=columns)
df.to_csv('cosmetologists.csv', index=False)
	#Import packages
	import requests
	from bs4 import BeautifulSoup
	import pandas as pd

	# Scrape the cosmetologist license search results from myfloridalicense.com
	# (board 05, license type 0501, county 51) and write them to cosmetologists.csv.
	# NOTE(review): this section looks like a second copy of the script above
	# whose nesting was flattened; the nesting is restored here under the
	# one-tab prefix the surrounding lines use.

	# Column order of the output CSV; one row is written per licensee.
	columns = ['License Type', 'Name', 'Main Address*', 'Mailing Address',
		'Name Type', 'License Number', 'Rank', 'Status', 'Expires']

	# Scraped rows are collected in a plain list and converted to a DataFrame once
	# at the end: DataFrame.append() was removed in pandas 2.0, and it copied the
	# whole frame on every call.
	records = []

	# The search endpoint is the same for every page, so define it once.
	url = "https://www.myfloridalicense.com/wl11.asp?mode=3&search=&SID=&brd=&typ="

	#For loop through 103 pages
	for page in range(1, 104):
		#Search form fields posted for this page of results
		data = {
			"hSearchOpt":"Organization",
			"hSearchAltName":"Alt",
			"hDivision":"ALL",
			#Cosmetologists
			"hBoard":"05",
			"hLicenseType":"0501",
			#Manatee county
			"hCounty":"51",
			"hState":"FL",
			"hCurrPage":"1",
			"hTotalPages":"103",
			"hTotalRecords":"5145",
			"hPageAction":"1",
			#Max results
			"hRecsPerPage":"50",
			#Loop through page number
			"Page":str(page),
			"SearchGo.x":"0",
			"SearchGo.y":"0"
		}
		response = requests.post(url, data=data)
		doc = BeautifulSoup(response.content, "html.parser")

		# Grab all the cosmetologists from table
		table = doc.find("table", attrs={'bgcolor':'#b6c9dc'})
		rows = table.find_all("tr")[1:] if table is not None else []
		if not rows:
			# Without the results table there is nothing to parse on this page.
			# Skip it instead of crashing so the pages already scraped still
			# reach the CSV.
			print('Skipping page {0}: no results table (HTTP {1})'.format(
				page, response.status_code))
			continue

		# Cosmetologist cells span one column, addresses span six.
		# Both lists are the same for every record on the page, so look them up
		# once rather than once per record.
		licensee_cells = rows[0].find_all("td", attrs={"colspan":"1"})
		address_cells = rows[0].find_all("td", attrs={"align":"left", "colspan":"6"})

		# Cosmetologists have five cells, addresses have one
		for n in range(0, 50):
			# Grab the five cells belonging to the n-th licensee
			start = n * 5
			cosmetologist = licensee_cells[start:start + 5]
			if len(cosmetologist) < 5:
				# Slicing past the end returns a short list instead of raising,
				# so the end of the page has to be detected explicitly.
				print('done')
				break

			# The n-th address cell belongs to the n-th licensee; index 1 of its
			# nested cells is the main address and index 3 the mailing address.
			address_parts = address_cells[n].find_all("td") if n < len(address_cells) else []
			main = address_parts[1].text.strip() if len(address_parts) > 1 else None
			mailing = address_parts[3].text.strip() if len(address_parts) > 3 else None

			# Cell 3 holds "license number[,rank]"; cell 4 holds "status[,expires]".
			# The second part of each is optional.
			license_parts = cosmetologist[3].get_text(separator=",").split(',')
			status_parts = cosmetologist[4].get_text(separator=",").split(',')
			rank = license_parts[1] if len(license_parts) > 1 else None
			expires = status_parts[1] if len(status_parts) > 1 else None

			# One dictionary per licensee, keyed by output column
			records.append({
				'License Type': cosmetologist[0].text.strip(),
				'Name': cosmetologist[1].text.strip(),
				'Main Address*': main,
				"Mailing Address": mailing,
				'Name Type': cosmetologist[2].text.strip(),
				"License Number": license_parts[0],
				"Rank": rank,
				"Status": status_parts[0],
				"Expires": expires
			})
		print('Completed page: {0}'.format(page))

	# Build the frame once from all collected rows, then write df to csv
	df = pd.DataFrame(records, columns=columns)
	df.to_csv('cosmetologists.csv', index=False)