Skip to content

Instantly share code, notes, and snippets.

@podolskyi
Created January 15, 2016 14:14
Show Gist options
  • Save podolskyi/d3fbba049b184adb3a08 to your computer and use it in GitHub Desktop.
Save podolskyi/d3fbba049b184adb3a08 to your computer and use it in GitHub Desktop.
# coding=utf-8
import random
import time
import re
import time
import datetime
from selenium import webdriver
from user_agent import generate_user_agent
# from pyvirtualdisplay import Display
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def start_browser_phjs():
    """Start a PhantomJS driver that sends a randomly generated user agent.

    The window is resized to 1280x854, the implicit wait is set to 10
    seconds, the page-load timeout to 60 seconds, and a ``WebDriverWait``
    with a 10-second timeout is attached to the driver as ``browser.wait``.

    Returns:
        The configured driver, or ``False`` when it could not be started or
        configured.
    """
    browser = None
    try:
        user_agent = generate_user_agent()
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = user_agent
        browser = webdriver.PhantomJS(desired_capabilities=dcap)
        browser.set_window_size(1280, 854)
        browser.implicitly_wait(10)
        browser.wait = WebDriverWait(browser, 10)
        browser.set_page_load_timeout(60)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still propagate to the caller.
        if browser is not None:
            # The driver process was already spawned; shut it down instead of
            # leaking it when a later configuration call fails.
            try:
                browser.quit()
            except Exception:
                # Best-effort cleanup: the caller only needs the False result.
                pass
        return False
    return browser
def _start_browser_ff():
    """Start Firefox inside a hidden virtual display with a random user agent.

    A 1280x854 invisible ``Display`` is started first, then Firefox is
    launched with a profile whose ``general.useragent.override`` preference
    is set to a generated user agent. No proxy is configured here.

    Returns:
        The Firefox driver, with a 10-second implicit wait and a 60-second
        page-load timeout.
    """
    # The module-level pyvirtualdisplay import is commented out, so ``Display``
    # would otherwise be an undefined name; import it only where it is needed.
    from pyvirtualdisplay import Display

    display = Display(visible=0, size=(1280, 854))
    display.start()
    br = None
    try:
        profile = webdriver.FirefoxProfile()
        user_agent = generate_user_agent()
        profile.set_preference("general.useragent.override", user_agent)
        br = webdriver.Firefox(profile)
        br.implicitly_wait(10)
        br.set_page_load_timeout(60)
    except Exception:
        # Do not leave a browser or virtual display running when startup fails.
        if br is not None:
            br.quit()
        display.stop()
        raise
    return br
def _start_br(proxy="159.203.120.202:8888"):
    """Start Chrome inside a hidden virtual display, routed through a proxy.

    Args:
        proxy: ``host:port`` of the HTTP proxy passed to Chrome via
            ``--proxy-server``. Defaults to the address that was previously
            hard-coded.

    Returns:
        The Chrome driver.
    """
    # The module-level pyvirtualdisplay import is commented out, so ``Display``
    # would otherwise be an undefined name; import it only where it is needed.
    from pyvirtualdisplay import Display

    display = Display(visible=0, size=(1280, 854))
    display.start()
    try:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--proxy-server=http://{0}".format(proxy))
        chrome_options.add_argument("--accept-language=en-US;q=0.8,en;q=0.4")
        chrome_options.add_argument("--Referer=https://www.owler.com")
        # The original literal read "(KHTML,/like Gecko)"; the stray slash was
        # a typo in the split string, not part of the real Chrome user agent.
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/46.0.2490.71 Safari/537.36")
        browser = webdriver.Chrome(chrome_options=chrome_options)
    except Exception:
        # Do not leave the virtual display running when Chrome fails to start.
        display.stop()
        raise
    return browser
def get_case_table_info(case_numbers, br):
    """Search every case number and dump cases, parties and events to files.

    Three pipe-delimited text files named ``PBTCases_<date>.txt``,
    ``PBTParties_<date>.txt`` and ``PBTEvents_<date>.txt`` (``<date>`` is
    today's date) are created in the current directory, each starting with a
    header line. For each case number the search form is submitted, the
    second row of the result grid is written to the cases file, and the
    parties and events pages are scraped through ``get_parties`` and
    ``get_events``.

    Args:
        case_numbers: Iterable of case-number strings to look up.
        br: Selenium driver already positioned on the search page.

    Returns:
        The driver, as handed back by ``get_events`` for the last case (or
        the one passed in when ``case_numbers`` is empty).
    """
    today = str(datetime.date.today())
    # ``with`` guarantees all three files are flushed and closed even when a
    # Selenium call raises part-way through the run.
    with open("PBTCases_" + today + ".txt", "w") as pbtcases, \
            open("PBTParties_" + today + ".txt", "w") as pbtparties, \
            open("PBTEvents_" + today + ".txt", "w") as pbtevents:
        pbtcases.write("CaseNumber|FileDate|Nature of Proceeding|CourtNo|Status" + '\n')
        pbtparties.write("CaseNumber|PartyRole|PName|PAddr|PAddr2|AName|AAddr|AAddr2|AttyPhone" + "\n")
        pbtevents.write("CaseNumber|DocumentID|FileDate|EventDesc|Comments|Number of Pages" + "\n")

        for case_number in case_numbers:
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtCaseNo").clear()
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtCaseNo").send_keys(case_number)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearch").click()

            case_table = br.find_element_by_id("ctl00_ContentPlaceHolder1_GridViewTitle")
            # Row 0 is skipped; the cells of row 1 describe the case.
            case = case_table.find_elements_by_xpath("tbody/tr")[1].find_elements_by_xpath("td")
            file_date = case[2].text.strip()
            nature_of_proceeding = case[5].text.strip()
            court_no = case[1].text.strip()
            status = case[4].text.strip()

            print("Info CN:", case_number)
            print("-------------------------------------------------------------------------------------")
            print(case_number, "|", file_date, "|", nature_of_proceeding, "|", court_no, "|", status)
            print("-------------------------------------------------------------------------------------")
            pbtcases.write(
                "|".join([case_number, file_date, nature_of_proceeding, court_no, status]) + '\n')

            # The last cell of the case row is passed on as the parties link.
            br = get_parties(case[-1], br, pbtparties)
            # The page changed, so the events link cell is looked up afresh.
            events_link = br.find_element_by_xpath(
                '//table[@id="ctl00_ContentPlaceHolder1_GridViewTitle"]/tbody/tr/td')
            br = get_events(events_link, br, pbtevents)
    return br
def get_parties(parties_link, br, pbtparties):
    """Open the parties page of a case and write one line per party.

    Args:
        parties_link: Element (a table cell) expected to contain the ``a``
            link that opens the parties page.
        br: Selenium driver used to read the parties grid.
        pbtparties: Writable file-like object receiving pipe-delimited lines
            ``CaseNumber|PartyRole|PName|PAddr|PAddr2|AName|AAddr|AAddr2|AttyPhone``.

    Returns:
        ``br``. When the link or the parties grid is missing, "No parties"
        is printed and nothing is written.
    """
    try:
        parties_link.find_element_by_xpath("a").click()
    except NoSuchElementException:
        print("No parties")
        return br
    try:
        parties_table = br.find_element_by_id("ctl00_ContentPlaceHolder1_GridViewParties2")
    except NoSuchElementException:
        print("No parties")
        return br

    # The first row is skipped; only the remaining rows carry party cells.
    rows = parties_table.find_elements_by_xpath("tbody/tr")[1:]
    print("----====Parties====----")
    for i, row in enumerate(rows):
        cells = row.find_elements_by_xpath("td")
        case_number = cells[0].text.strip()
        partyrole = cells[1].text.strip()

        # Per-row control IDs are numbered from 02 and zero-padded to two
        # digits (ctl02 ... ctl09, ctl10, ...). The old "ctl0" + str(i + 2)
        # produced "ctl010" for the ninth party, which does not exist.
        prefix = "ctl00_ContentPlaceHolder1_GridViewParties2_ctl{0:02d}_lbl".format(i + 2)
        pname = cells[2].find_element_by_id(prefix + "PName").text
        paddr = cells[2].find_element_by_id(prefix + "PAddr").text
        paddr2 = cells[2].find_element_by_id(prefix + "PAddr2").text
        aname = cells[3].find_element_by_id(prefix + "AName").text
        aaddr = cells[3].find_element_by_id(prefix + "AAddr").text
        aaddr2 = cells[3].find_element_by_id(prefix + "AAddr2").text
        attyphone = cells[3].find_element_by_id(prefix + "AttyPhone").text

        print("------------------------------------------------------------------------------------------------------")
        print(case_number, "|", partyrole, "|", pname, "|", paddr, "|", paddr2, "|", aname, "|", aaddr, "|"
              , aaddr2, "|", attyphone)
        print("------------------------------------------------------------------------------------------------------")
        pbtparties.write("|".join([case_number, partyrole, pname, paddr, paddr2,
                                   aname, aaddr, aaddr2, attyphone]) + "\n")
    return br
def get_events(events_link, br, pbtevents):
    """Open the events page of a case and write one line per event row.

    Args:
        events_link: Element (a table cell) expected to contain the ``a``
            link that opens the events page.
        br: Selenium driver used to read the events table.
        pbtevents: Writable file-like object receiving pipe-delimited lines
            ``CaseNumber|DocumentID|FileDate|EventDesc|Comments|Number of Pages``.

    Returns:
        ``br``. Nothing is written when the link or the events table is
        missing.
    """
    try:
        events_link.find_element_by_xpath("a").click()
    except NoSuchElementException:
        return br
    try:
        events_table = br.find_element_by_id("itemPlaceholderContainer")
    except NoSuchElementException:
        return br

    print("----====Events====----")
    for row in events_table.find_elements_by_xpath("tbody/tr"):
        cells = row.find_elements_by_xpath("td")
        if len(cells) < 6:
            # Rows without the six data cells (for example a header row made
            # of th cells) carry no event; skip them instead of failing with
            # IndexError and aborting the whole run.
            continue
        case_number = cells[0].find_element_by_xpath("span").text
        document_id = cells[5].find_element_by_xpath("a").text
        file_date = cells[1].find_element_by_xpath("span").text
        event_desc = cells[2].find_element_by_xpath("span").text
        comments = cells[3].find_element_by_xpath("span").text
        number_of_page = cells[4].find_element_by_xpath("a").text

        print("------------------------------------------------------------------------------------------------------")
        print(case_number, "|", document_id, "|", file_date, "|", event_desc, "|", comments, "|", number_of_page)
        print("------------------------------------------------------------------------------------------------------")
        pbtevents.write("|".join([case_number, document_id, file_date,
                                  event_desc, comments, number_of_page]) + "\n")
    return br
def get_page(url):
    """Scrape all cases filed on the most recent date the site offers.

    Opens ``url`` in PhantomJS, reads the ``lblMaxDate`` label, extracts a
    ``YYYY-MM-DD`` date from it, searches for that single day, collects the
    case numbers from the result table and passes them to
    ``get_case_table_info``. When the label holds no date nothing is scraped.

    Args:
        url: Address of the search page.

    Raises:
        SystemExit: If the browser cannot be started, or if the max-date
            label cannot be read (a screenshot is saved to ``error.png``
            first and the exit status is 1).
    """
    br = start_browser_phjs()
    if not br:
        # start_browser_phjs() signals failure by returning False; calling
        # .get() on that would only raise an unhelpful AttributeError.
        raise SystemExit("Could not start the PhantomJS browser")

    max_date_id = "ctl00_ContentPlaceHolder1_lblMaxDate"
    try:
        br.get(url)
        try:
            br.wait.until(EC.presence_of_element_located((By.ID, max_date_id)))
        except TimeoutException:
            # Fall through: the direct lookup below gets one more chance.
            pass
        try:
            last_date_available_string = br.find_element_by_id(max_date_id).text
        except Exception:
            br.get_screenshot_as_file("error.png")
            raise SystemExit(1)

        last_date_available = re.search(r"[\d]{4}-[\d]{2}-[\d]{2}", str(last_date_available_string))
        if last_date_available:
            date = datetime.datetime.strptime(last_date_available.group(), "%Y-%m-%d").date()
            # NOTE(review): this sends a two-digit year, while the old
            # commented-out sample value was "12/15/2015" -- confirm which
            # form the search page expects.
            d = date.strftime("%m/%d/%y")
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtDateFrom").send_keys(d)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtDateTo").send_keys(d)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchN").click()
            time.sleep(5)

            table = br.find_element_by_id("itemPlaceholderContainer")
            # The leading "." keeps the search inside ``table``; an XPath that
            # starts with "//" is evaluated against the whole document.
            rows = table.find_elements_by_xpath(
                './/tbody/tr[contains(@class, "even") or contains(@class, "odd")]')
            case_numbers = [row.find_element_by_xpath("td/a").text for row in rows]
            br = get_case_table_info(case_numbers, br)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and ``finally`` covers every exit path.
        br.quit()
def main():
    """Run the scraper against the hard-coded probate web-search URL."""
    get_page("http://www.cclerk.hctx.net/applications/websearch/Probate.aspx")
# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment