# This version is just a cleanup of the script I used while developing; it uses functions
# and has an error-handling system. Thought you might find it useful.
import csv
import os
import re
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
# Search for SALE UNDER POWER and fetch all URLs
def Get_URLs():
    try:
        time.sleep(2)
        # Enter the search phrase and require an exact-phrase match
        search = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtSearch"]""")
        search.send_keys("SALE UNDER POWER")
        exact_phrase = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rdoType"]/li[3]/label""")
        exact_phrase.click()
        time.sleep(5)
        # Expand the date-range panel and restrict results to the last N weeks
        plus = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_divDateRange"]/label/a""")
        plus.click()
        date_range = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_rbLastNumWeeks"]""")
        date_range.click()
        last_weeks = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_txtLastNumWeeks"]""")
        last_weeks.clear()
        last_weeks.send_keys(no_last_weeks)
        # Run the search
        submit = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_as1_btnGo"]""")
        submit.click()
        time.sleep(5)
        # Show 50 results per page (option[7] of the per-page dropdown)
        set_50 = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_ddlPerPage"]/option[7]""")
        set_50.click()
        time.sleep(5)
        total_pages = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_lblTotalPages"]""").text
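        # The label text looks like "of 12 Pages" (page count here is illustrative);
        # stripping the surrounding words leaves the total number of result pages.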
        total_pages = int((total_pages.replace("of ", "").replace(" Pages", "")).strip())
        # Collect the record URLs from the first page, then page through the rest
        urls = []
        data_urls(urls)
        for _ in range(total_pages - 1):
            next_page = driver.find_element_by_xpath("""//*[@id="ctl00_ContentPlaceHolder1_WSExtendedGridNP1_GridView1_ctl01_btnNext"]""")
            next_page.click()
            time.sleep(5)
            data_urls(urls)
        return urls
    except Exception as error:
        ErrorHandler(error)
        return []
# Scrape the articles from the urls list and write them to a .csv file
def Write_Article_Data(file_name: str, urls: list):
    try:
        for url in urls:
            driver.get(url)
            time.sleep(3)
            soup_x = BeautifulSoup(driver.page_source, 'html.parser')
            publication_name = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblPubName").text).strip()
            publication_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkPubURL").text).strip()
            publication_city = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCity").text).strip()
            publication_state = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblState").text).strip()
            publication_county = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblCounty").text).strip()
            notice_keywords = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblKeywords").text).strip()
            notice_auth_no = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lblNoticeAuthenticationNo").text).strip()
            notice_url = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_PublicNoticeDetails1_lnkNoticeURL").text).strip()
            notice_publish_date = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblPublicationDAte").text).strip()
            content_text = str(soup_x.select_one("#ctl00_ContentPlaceHolder1_PublicNoticeDetailsBody1_lblContentText").text).replace('\n', '')
            address = re.findall(r"(known|to) as(:?) ([+ [\w ,\.\-\/\#']+ [\w \.\-']+ [-\dA-Z]+[0-9])", content_text)
            # If no address was found, record "Null"; otherwise keep the third item of the
            # first matched tuple, which is the captured address itself.
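            # Illustrative example (the notice text below is made up): for content_text containing
            # "... property known as: 123 Main Street, Atlanta, GA 30303 ..." the findall above
            # should return [('known', ':', '123 Main Street, Atlanta, GA 30303')].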
            if address == []:
                address = "Null"
            else:
                address = address[0][2]
            with open(f'{file_name}.csv', 'a', newline='', encoding="utf-8") as new_file:
                csv_writer = csv.writer(new_file)
                csv_writer.writerow([publication_name, publication_url, publication_city, publication_state, publication_county, notice_keywords, notice_auth_no, notice_url, notice_publish_date, content_text, address])
        return True
    except Exception as error:
        ErrorHandler(error)
        return False
# Collect the record URLs from the current results page
def data_urls(urls):
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        items = soup.find_all("input", {"title": "Click to open the record."})
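        # Each result row carries an onclick of the form
        # "javascript:location.href='<relative details URL>';return false;" (the exact relative
        # path varies); stripping the wrapper leaves the path, which is joined to the site root.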
        for item in items:
            url = "https://www.georgiapublicnotice.com/" + item["onclick"].replace("javascript:location.href='", "").replace("';return false;", "")
            urls.append(url)
    except Exception as error:
        ErrorHandler(error)
# Handles any errors caught in try-except blocks and prints the details in a clean manner
def ErrorHandler(err):
    # Get details about the exception
    err_type, err_obj, traceback = sys.exc_info()
    print("\n---------------------------------------------------------")
    print("An error occurred! Here is more detail\n")
    print("---------------------------------------------------------")
    print("\n ERROR:", err)
    print("Traceback:", traceback, "-- type:", err_type)
    print("Error occurred on line: ", traceback.tb_lineno)
    print("---------------------------------------------------------")
    driver.quit()
if __name__ == "__main__":
    # Receive user input
    no_last_weeks = str(input("Number of last weeks: ")).strip()
    file_name = str(input("Name of the output file to be saved: ")).strip()
    # Remove the file if it already exists
    try:
        os.remove(f"{file_name}.csv")
    except FileNotFoundError:
        pass
    except Exception as error:
        ErrorHandler(error)
    # Start the browser
    url = "https://www.georgiapublicnotice.com/"
    driver = webdriver.Chrome()
    driver.get(url)
    # Create the .csv file and write the header row
    with open(f'{file_name}.csv', 'a', newline='') as new_file:
        csv_writer = csv.writer(new_file)
        csv_writer.writerow(["Publication Name", "Publication Url", "Publication City", "Publication State", "Publication County", "Notice Keywords", "Notice Auth No", "Notice Url", "Notice Publish Date", "Content Text", "Address"])
    # Scrape and write data to the .csv file
    urls = Get_URLs()
    successfulScrape = Write_Article_Data(file_name, urls)
    if successfulScrape:
        print("Scrape was successful!")
    else:
        print("An unexpected error occurred. Check the logs.")
    driver.quit()
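
# Example run (script name and the answers below are illustrative; any week count
# and file name work):
#   $ python georgia_public_notice_scraper.py
#   Number of last weeks: 4
#   Name of the output file to be saved: notices
# The results are written to notices.csv in the current directory.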