Scraping script for "SALE UNDER POWER" (foreclosure) notices on georgiapublicnotice.com. It prompts for the number of past weeks to search and an output file name, then writes one CSV row per notice.
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import time
import os
import re

no_last_weeks = input("Number of last weeks: ").strip()
file_name = input("Name of the output file to be saved: ").strip()

# Start fresh: remove any previous output file with the same name.
try:
    os.remove(f"{file_name}.csv")
except OSError:
    pass


def data_urls():
    """Collect the detail-page URL of every record on the current results page."""
    soup = BeautifulSoup(driver.page_source, "html.parser")
    items = soup.find_all("input", {"title": "Click to open the record."})
    for item in items:
        # Each record's onclick handler embeds its relative URL; strip the JS wrapper.
        url = "https://www.georgiapublicnotice.com/" + (
            item["onclick"]
            .replace("javascript:location.href='", "")
            .replace("';return false;", "")
        )
        urls.append(url)


try:
    url = "https://www.georgiapublicnotice.com/"
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(2)

    # Search for the exact phrase "SALE UNDER POWER".
    search = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]')
    search.send_keys("SALE UNDER POWER")
    exact_phrase = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label')
    exact_phrase.click()
    time.sleep(5)

    # Open the date-range panel and limit results to the last N weeks.
    plus = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a')
    plus.click()
    date_range = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]')
    date_range.click()
    last_weeks = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]')
    last_weeks.clear()
    last_weeks.send_keys(no_last_weeks)
    submit = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]')
    submit.click()
    time.sleep(5)

    # Show 50 results per page to reduce pagination.
    set_50 = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]')
    set_50.click()
    time.sleep(5)

    # Parse the page counter, e.g. "of 12 Pages" -> 12.
    total_pages = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]').text
    total_pages = int(total_pages.replace("of ", "").replace(" Pages", "").strip())

    # Walk every results page and collect all record URLs.
    urls = []
    data_urls()
    for _ in range(total_pages - 1):
        next_page = driver.find_element(By.XPATH, '//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]')
        next_page.click()
        time.sleep(5)
        data_urls()

    # Write the CSV header once; detail rows are appended below.
    with open(f"{file_name}.csv", "a", newline="", encoding="utf-8") as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City",
                             "Publication State", "Publication County", "Notice Keywords",
                             "Notice Auth No", "Notice Url", "Notice Publish Date",
                             "Content Text", "Address"])

    # Visit each record's detail page and pull out the labeled fields.
    for url in urls:
        driver.get(url)
        time.sleep(3)
        soup_x = BeautifulSoup(driver.page_source, "html.parser")
        publication_name = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text.strip()
        publication_url = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text.strip()
        publication_city = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text.strip()
        publication_state = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text.strip()
        publication_county = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text.strip()
        notice_keywords = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text.strip()
        notice_auth_no = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text.strip()
        notice_url = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text.strip()
        notice_publish_date = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text.strip()
        content_text = soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text.replace("\n", "")

        # Extract the property address from the notice body
        # ("... is known as <street>, <city>, <ST> <zip> ...").
        matches = re.findall(r"is known as (\d+ [\w ,\.\-\/\#']+), ([\w \.\-']+), ([A-Z]{2}) ([-\dA-Z]+)", content_text)
        address = ", ".join(matches[0]) if matches else ""

        with open(f"{file_name}.csv", "a", newline="", encoding="utf-8") as new_file:
            csv_writer = csv.writer(new_file)
            csv_writer.writerow([publication_name, publication_url, publication_city,
                                 publication_state, publication_county, notice_keywords,
                                 notice_auth_no, notice_url, notice_publish_date,
                                 content_text, address])
    driver.quit()
except Exception:
    driver.quit()
    print("please try again...")