@hurrifan1
Created October 27, 2021 16:24
Using Python to access a password-protected page and scrape the data therein
# Using Python to access a password-protected page and scrape the data therein
#
# Make sure to install the Selenium and BeautifulSoup packages before running this script.
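#   For example: pip install selenium beautifulsoup4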
#
# Also, you need to install the ChromeDriver that matches your Chrome version:
# - https://chromedriver.chromium.org/downloads
#
# Things you can do to adapt and improve this script:
# - Install and use the python-dotenv package to read your login credentials from a .env file
#   instead of hard-coding them, so you don't accidentally expose them to code reviewers
#   (a minimal sketch follows this header).
# - Install and use the Google Drive packages for pushing the results CSV to Google Drive
#   once it has new data.
#
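# A minimal sketch of the python-dotenv idea above (the .env keys BWC_CODE and
# BWC_PASSWORD are placeholders, not names used elsewhere in this script):
#
#   from dotenv import load_dotenv
#   import os
#   load_dotenv()  # Read key=value pairs from a local .env file into the environment
#   access_code = os.environ["BWC_CODE"]
#   access_password = os.environ["BWC_PASSWORD"]
#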
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
from datetime import date
from time import sleep
import csv
# Initialize the Chrome driver
driver = webdriver.Chrome()
# Open the login page
driver.get("https://bwc.ag")
sleep(2)
# Input credentials and then click the login button
driver.find_element_by_name("txtAccessOfCode").send_keys("JCisDouche")
sleep(1)
driver.find_element_by_name("txtAccessOfPassword").send_keys("veryPassword")
sleep(1)
driver.find_element_by_css_selector("input[value=login]").click()
sleep(1)
# Establish the fields/CSV headers
data_fields = [
    "Business Name",
    "Phone Number",
    "License #",
    "Physical Address",
    "Issue Date",
    "Expiration Date"
]
# Create a CSV for our data including the CSV headers
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(data_fields)
# Scrape the table data, iterating through each page
data = []
page = 1
#### Start iterating here! ####
done = False
while not done:
    sleep(2)
    print("#### Current Page: ", page)
    print("#### Cumulative rows: ", len(data))
    soup = bs(driver.page_source, "html.parser")
    table = soup.find("table", attrs={"id": "GridView1"}).find("tbody")
    rows = table.find_all("tr")
    for i, row in enumerate(rows):
        if i + 1 == len(rows):
            continue  # Skip the final row
        cols = row.find_all("td")
        if len(cols) == 0:
            continue  # Skip the header row
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
    # Advance the pager: prefer a link whose text is the next page number; otherwise
    # click "..." to jump to the next block of pages; if neither exists, we're done.
    page += 1
    next_link = driver.find_elements_by_link_text(str(page))
    if len(next_link) != 0:
        next_link[-1].click()
    elif len(driver.find_elements_by_link_text("...")) != 0:
        driver.find_elements_by_link_text("...")[-1].click()
    else:
        done = True
# Save the results to the CSV
with open('data.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)
# Shut down Chrome and end the WebDriver session
driver.quit()