# This version is just a cleanup of the script I used while developing; it uses functions
# and has an error-handling system. Thought you might find it useful.
import csv
import os
import re
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
# Search for SALE UNDER POWER and fetch all URLs
def Get_URLs():
    try:
        time.sleep(2)
        # Enter the search phrase and require an exact-phrase match
        search = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]""")
        search.send_keys("SALE UNDER POWER")
        exact_phrase = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label""")
        exact_phrase.click()
        time.sleep(5)
        # Expand the date-range panel and restrict results to the last N weeks
        plus = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a""")
        plus.click()
        date_range = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]""")
        date_range.click()
        last_weeks = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]""")
        last_weeks.clear()
        last_weeks.send_keys(no_last_weeks)
        # Run the search
        submit = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]""")
        submit.click()
        time.sleep(5)
        # Show 50 results per page (option[7] of the per-page dropdown)
        set_50 = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]""")
        set_50.click()
        time.sleep(5)
        total_pages = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]""").text
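        # The label text looks like "of 12 Pages" (page count here is illustrative);
        # stripping the surrounding words leaves the total number of result pages.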
        total_pages = int((total_pages.replace("of ", "").replace(" Pages", "")).strip())
        # Collect the record URLs from the first page, then page through the rest
        urls = []
        data_urls(urls)
        for _ in range(total_pages - 1):
            next_page = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]""")
            next_page.click()
            time.sleep(5)
            data_urls(urls)
        return urls
    except Exception as error:
        ErrorHandler(error)
        return []
# Scrape the articles from the urls list and write them to a .csv file
def Write_Article_Data(file_name: str, urls: list):
    try:
        for url in urls:
            driver.get(url)
            time.sleep(3)
            soup_x = BeautifulSoup(driver.page_source, 'html.parser')
            publication_name = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text).strip()
            publication_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text).strip()
            publication_city = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text).strip()
            publication_state = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text).strip()
            publication_county = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text).strip()
            notice_keywords = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text).strip()
            notice_auth_no = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text).strip()
            notice_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text).strip()
            notice_publish_date = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text).strip()
            content_text = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text).replace('\n', '')
            address = re.findall(r"(known|to) as(:?) ([+ [\w ,\.\-\/\#']+ [\w \.\-']+ [-\dA-Z]+[0-9])", content_text)
            # If no address was found, record "Null"; otherwise keep the third item of the
            # first matched tuple, which is the captured address itself.
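            # Illustrative example (the notice text below is made up): for content_text containing
            # "... property known as: 123 Main Street, Atlanta, GA 30303 ..." the findall above
            # should return [('known', ':', '123 Main Street, Atlanta, GA 30303')].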
            if address == []:
                address = "Null"
            else:
                address = address[0][2]
            with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
                csv_writer = csv.writer(new_file)
                csv_writer.writerow([publication_name, publication_url, publication_city, publication_state, publication_county, notice_keywords, notice_auth_no, notice_url, notice_publish_date, content_text, address])
        return True
    except Exception as error:
        ErrorHandler(error)
        return False
# Collect the record URLs from the current results page
def data_urls(urls):
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all("input", {"title": "Click to open the record."})
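        # Each result row carries an onclick of the form
        # "javascript:location.href='<relative details URL>';return false;" (the exact relative
        # path varies); stripping the wrapper leaves the path, which is joined to the site root.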
        for item in items:
            url = "https://www.georgiapublicnotice.com/" + item["onclick"].replace("javascript:location.href='", "").replace("';return false;", "")
            urls.append(url)
    except Exception as error:
        ErrorHandler(error)
# Handles any errors caught in try-except blocks and prints the details in a clean manner
def ErrorHandler(err):
    # Get details about the exception
    err_type, err_obj, traceback = sys.exc_info()
    print("\n---------------------------------------------------------")
    print("An error occurred! Here is more detail\n")
    print("---------------------------------------------------------")
    print("\n ERROR:", err)
    print("Traceback:", traceback, "-- type:", err_type)
    print("Error occurred on line: ", traceback.tb_lineno)
    print("---------------------------------------------------------")
    driver.quit()
if __name__ == "__main__":
    # Receive user input
    no_last_weeks = str(input("Number of last weeks: ")).strip()
    file_name = str(input("Name of the output file to be saved: ")).strip()
    # Remove the file if it already exists
    try:
        os.remove(f"{file_name}.csv")
    except FileNotFoundError:
        pass
    except Exception as error:
        ErrorHandler(error)
    # Start the browser
    url = "https://www.georgiapublicnotice.com/"
    driver = webdriver.Chrome()
    driver.get(url)
    # Create the .csv file and write the header row
    with open(f'{file_name}.csv', 'a', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City", "Publication State", "Publication County", "Notice Keywords", "Notice Auth No", "Notice Url", "Notice Publish Date", "Content Text", "Address"])
    # Scrape and write data to the .csv file
    urls = Get_URLs()
    successfulScrape = Write_Article_Data(file_name, urls)
    if successfulScrape:
        print("Scrape was successful!")
    else:
        print("An unexpected error occurred. Check the logs.")
    driver.quit()
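
# Example run (script name and the answers below are illustrative; any week count
# and file name work):
#   $ python georgia_public_notice_scraper.py
#   Number of last weeks: 4
#   Name of the output file to be saved: notices
# The results are written to notices.csv in the current directory.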