Using Python to access a password-protected page and scrape the data therein
# Using Python to access a password-protected page and scrape the data therein
#
# Make sure to install the Selenium and BeautifulSoup (beautifulsoup4) packages before running this script.
#
# You also need to install the ChromeDriver that matches your Chrome version:
# - https://chromedriver.chromium.org/downloads
#
# Things you can do to adapt and improve this script:
# - Install and use the python-dotenv package so your login credentials live in a .env file
#   instead of the script itself, where code reviewers could accidentally see them
#   (a sketch follows this comment block).
# - Install and use the Google Drive packages to push the results CSV to Google Drive
#   once it has new data (a sketch appears at the end of this script).
#
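# A minimal sketch of the dotenv approach suggested above, assuming a .env file
# containing hypothetical ACCESS_CODE=... and ACCESS_PASSWORD=... entries:
#
#   import os
#   from dotenv import load_dotenv
#
#   load_dotenv()  # Reads .env into the process environment
#   access_code = os.getenv("ACCESS_CODE")
#   access_password = os.getenv("ACCESS_PASSWORD")
#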
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
from time import sleep
import csv
# Initialize the Chrome driver
driver = webdriver.Chrome()
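# To run unattended with no visible browser window, Chrome's headless mode is
# an option; a minimal sketch:
#
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless")
#   driver = webdriver.Chrome(options=options)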
# Open the login page
driver.get("https://bwc.ag")
sleep(2)

# Input credentials and then click the login button
driver.find_element(By.NAME, "txtAccessOfCode").send_keys("JCisDouche")
sleep(1)
driver.find_element(By.NAME, "txtAccessOfPassword").send_keys("veryPassword")
sleep(1)
driver.find_element(By.CSS_SELECTOR, "input[value=login]").click()
sleep(1)
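# The fixed sleep() calls above are a blunt way to wait for pages to load;
# Selenium's explicit waits are a more robust alternative. A minimal sketch,
# waiting up to 10 seconds for the results table (id "GridView1", used below):
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#
#   WebDriverWait(driver, 10).until(
#       EC.presence_of_element_located((By.ID, "GridView1"))
#   )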
# Establish the fields/CSV headers
data_fields = [
    "Business Name",
    "Phone Number",
    "License #",
    "Physical Address",
    "Issue Date",
    "Expiration Date"
]
# Create a CSV for our data including the CSV headers
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(data_fields)
# Scrape the table data, iterating through each page of results
data = []
page = 1

#### Start iterating here! ####
done = False
while not done:
    sleep(2)
    print("#### Current page:", page)
    print("#### Cumulative rows:", len(data))
    soup = bs(driver.page_source, "html.parser")
    table = soup.find("table", attrs={"id": "GridView1"}).find("tbody")
    rows = table.find_all("tr")
    for i, row in enumerate(rows):
        if i + 1 == len(rows):
            continue  # Skip the final row (the pager row)
        cols = row.find_all('td')
        if len(cols) == 0:
            continue  # Skip the header row (it uses <th> cells, so no <td>s are found)
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # Get rid of empty values
    # Advance the pager: click the next page-number link if it's visible,
    # otherwise click the "..." link that reveals the next block of page
    # numbers; if neither exists, we've reached the last page.
    page += 1
    next_link = driver.find_elements(By.LINK_TEXT, str(page))
    if len(next_link) != 0:
        next_link[-1].click()
    elif len(driver.find_elements(By.LINK_TEXT, "...")) != 0:
        driver.find_elements(By.LINK_TEXT, "...")[-1].click()
    else:
        done = True
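# An alternative to the manual row parsing above: pandas can read HTML tables
# directly. A minimal sketch, assuming the pandas and lxml packages are
# installed:
#
#   import pandas as pd
#   df = pd.read_html(driver.page_source, attrs={"id": "GridView1"})[0]
#   df.to_csv('data.csv', index=False)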
# Save the results to the CSV
with open('data.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data)

# Close the Chrome instance
driver.close()
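# To push the results CSV to Google Drive (as suggested at the top), one
# option is the pydrive2 package; a minimal sketch, assuming OAuth client
# credentials have been set up per the pydrive2 docs:
#
#   from pydrive2.auth import GoogleAuth
#   from pydrive2.drive import GoogleDrive
#
#   gauth = GoogleAuth()
#   gauth.LocalWebserverAuth()  # Opens a browser window to authorize access
#   drive = GoogleDrive(gauth)
#   gfile = drive.CreateFile({"title": "data.csv"})
#   gfile.SetContentFile("data.csv")
#   gfile.Upload()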