Using Python to access a password-protected page and scrape the data therein
# Using Python to access a password-protected page and scrape the data therein
#
# Make sure to install the Selenium and BeautifulSoup (beautifulsoup4) packages before running this script.
#
# You also need to install the ChromeDriver that matches your Chrome version:
# - https://chromedriver.chromium.org/downloads
#
# Things you can do to adapt and improve this script:
# - Install and use the python-dotenv package so your login credentials live in a .env file
#   instead of the script itself, where code reviewers could accidentally see them
#   (a sketch follows this comment block).
# - Install and use the Google Drive packages to push the results CSV to Google Drive
#   once it has new data (a sketch appears at the end of this script).
#
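# A minimal sketch of the dotenv approach suggested above, assuming a .env file
# containing hypothetical ACCESS_CODE=... and ACCESS_PASSWORD=... entries:
#
#   import os
#   from dotenv import load_dotenv
#
#   load_dotenv()  # Reads .env into the process environment
#   access_code = os.getenv("ACCESS_CODE")
#   access_password = os.getenv("ACCESS_PASSWORD")
#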
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
from time import sleep
import csv
# Initialize the Chrome driver
driver = webdriver.Chrome()
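# To run unattended with no visible browser window, Chrome's headless mode is
# an option; a minimal sketch:
#
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless")
#   driver = webdriver.Chrome(options=options)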
# Open the login page
driver.get("https://bwc.ag")
sleep(2)

# Input credentials and then click the login button
driver.find_element(By.NAME, "txtAccessOfCode").send_keys("JCisDouche")
sleep(1)
driver.find_element(By.NAME, "txtAccessOfPassword").send_keys("veryPassword")
sleep(1)
driver.find_element(By.CSS_SELECTOR, "input[value=login]").click()
sleep(1)
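# The fixed sleep() calls above are a blunt way to wait for pages to load;
# Selenium's explicit waits are a more robust alternative. A minimal sketch,
# waiting up to 10 seconds for the results table (id "GridView1", used below):
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#
#   WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.ID, "GridView1"))
#   )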
# Establish the fields/CSV headers
data_fields = [
    "Business Name",
    "Phone Number",
    "License #",
    "Physical Address",
    "Issue Date",
    "Expiration Date"
]
# Create a CSV for our data including the CSV headers
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(data_fields)
# Scrape the table data, iterating through each page of results
data = []
page = 1

#### Start iterating here! ####
done = False
while not done:
    sleep(2)
    print("#### Current page:", page)
    print("#### Cumulative rows:", len(data))
    soup = bs(driver.page_source, "html.parser")
    table = soup.find("table", attrs={"id": "GridView1"}).find("tbody")
    rows = table.find_all("tr")
    for i, row in enumerate(rows):
        if i + 1 == len(rows):
            continue  # Skip the final row (the pager row)
        cols = row.find_all('td')
        if len(cols) == 0:
            continue  # Skip the header row (it uses <th> cells, so no <td>s are found)
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
    # Advance the pager: click the next page-number link if it's visible,
    # otherwise click the "..." link that reveals the next block of page
    # numbers; if neither exists, we've reached the last page.
    page += 1
    next_link = driver.find_elements(By.LINK_TEXT, str(page))
    if len(next_link) != 0:
        next_link[-1].click()
    elif len(driver.find_elements(By.LINK_TEXT, "...")) != 0:
        driver.find_elements(By.LINK_TEXT, "...")[-1].click()
    else:
        done = True
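# An alternative to the manual row parsing above: pandas can read HTML tables
# directly. A minimal sketch, assuming the pandas and lxml packages are
# installed:
#
#   import pandas as pd
#   df = pd.read_html(driver.page_source, attrs={"id": "GridView1"})[0]
#   df.to_csv('data.csv', index=False)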
# Save the results to the CSV
with open('data.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

# Close the Chrome instance
driver.close()
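# To push the results CSV to Google Drive (as suggested at the top), one
# option is the pydrive2 package; a minimal sketch, assuming OAuth client
# credentials have been set up per the pydrive2 docs:
#
#   from pydrive2.auth import GoogleAuth
#   from pydrive2.drive import GoogleDrive
#
#   gauth = GoogleAuth()
#   gauth.LocalWebserverAuth()  # Opens a browser window to authorize access
#   drive = GoogleDrive(gauth)
#   gfile = drive.CreateFile({"title": "data.csv"})
#   gfile.SetContentFile("data.csv")
#   gfile.Upload()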