# Created January 15, 2016 14:14.
# Save podolskyi/d3fbba049b184adb3a08 to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled
# differently than what appears below. To review, open the file in an editor that
# reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# coding=utf-8
import datetime
import random
import re
import sys
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from user_agent import generate_user_agent
# from pyvirtualdisplay import Display


def start_browser_phjs():
    """Start a PhantomJS WebDriver session with a generated user agent.

    The driver gets a 1280x854 window, a 10 second implicit wait, a
    60 second page-load timeout and a ``wait`` attribute holding a
    ``WebDriverWait`` with a 10 second timeout.

    Returns:
        The configured driver, or ``False`` when the browser could not be
        started or configured.
    """
    browser = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = generate_user_agent()
        browser = webdriver.PhantomJS(desired_capabilities=dcap)
        browser.set_window_size(1280, 854)
        browser.implicitly_wait(10)
        browser.wait = WebDriverWait(browser, 10)
        browser.set_page_load_timeout(60)
    except Exception as exc:
        # Report why start-up failed instead of silently swallowing it.
        print("Could not start PhantomJS:", exc)
        if browser is not None:
            # The driver process exists but is only half configured; shut it
            # down so it does not linger after we report failure.
            try:
                browser.quit()
            except Exception:
                # Best-effort cleanup: the start-up error was already reported.
                pass
        return False
    return browser
def _start_browser_ff():
    """Start Firefox with a generated user agent behind a hidden display.

    A 1280x854 ``Display`` is started first, then a Firefox profile whose
    ``general.useragent.override`` preference is set to a generated user
    agent. The driver gets a 10 second implicit wait and a 60 second
    page-load timeout.

    NOTE(review): ``Display`` is not imported in the visible part of this
    file (the pyvirtualdisplay import is commented out), so this helper
    presumably fails with ``NameError`` until that import is restored.

    Returns:
        The Firefox driver.
    """
    virtual_display = Display(visible=0, size=(1280, 854))
    virtual_display.start()

    ff_profile = webdriver.FirefoxProfile()
    ff_profile.set_preference("general.useragent.override", generate_user_agent())

    driver = webdriver.Firefox(ff_profile)
    driver.implicitly_wait(10)
    driver.set_page_load_timeout(60)
    return driver
def _start_br():
    """Start Chrome behind a hidden display with a fixed set of arguments.

    A 1280x854 ``Display`` is started first; Chrome is then launched with a
    hard-coded proxy server, accept-language, referer and user-agent
    argument.

    NOTE(review): ``Display`` is not imported in the visible part of this
    file (the pyvirtualdisplay import is commented out) -- confirm before
    using this helper.
    NOTE(review): the user-agent literal contains "KHTML,/like Gecko",
    which looks like a typo for "KHTML, like Gecko" -- left unchanged here.

    Returns:
        The Chrome driver.
    """
    virtual_display = Display(visible=0, size=(1280, 854))
    virtual_display.start()

    arguments = (
        "--proxy-server=http://{0}".format("159.203.120.202:8888"),
        "--accept-language=en-US;q=0.8,en;q=0.4",
        "--Referer=https://www.owler.com",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML,/"
        "like Gecko) Chrome/46.0.2490.71 Safari/537.36",
    )
    chrome_options = webdriver.ChromeOptions()
    for argument in arguments:
        chrome_options.add_argument(argument)

    return webdriver.Chrome(chrome_options=chrome_options)
def get_case_table_info(case_numbers, br):
    """Search every case number and export its case, party and event data.

    Three pipe-delimited text files are created in the current working
    directory, each named with today's date (``PBTCases_<date>.txt``,
    ``PBTParties_<date>.txt``, ``PBTEvents_<date>.txt``) and each starting
    with a header line. The files are managed by a ``with`` statement so
    they are closed even when a Selenium call raises part-way through.

    Args:
        case_numbers: Iterable of case-number strings to look up.
        br: WebDriver positioned on the search page.

    Returns:
        The driver as handed back by the last ``get_parties`` /
        ``get_events`` call (``br`` itself when ``case_numbers`` is empty).
    """
    stamp = str(datetime.date.today())
    with open("PBTCases_" + stamp + ".txt", "w") as pbtcases, \
            open("PBTParties_" + stamp + ".txt", "w") as pbtparties, \
            open("PBTEvents_" + stamp + ".txt", "w") as pbtevents:
        pbtcases.write("CaseNumber|FileDate|Nature of Proceeding|CourtNo|Status" + '\n')
        pbtparties.write("CaseNumber|PartyRole|PName|PAddr|PAddr2|AName|AAddr|AAddr2|AttyPhone" + "\n")
        pbtevents.write("CaseNumber|DocumentID|FileDate|EventDesc|Comments|Number of Pages" + "\n")

        for case_number in case_numbers:
            case_box = br.find_element_by_id("ctl00_ContentPlaceHolder1_txtCaseNo")
            case_box.clear()
            case_box.send_keys(case_number)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearch").click()

            case_table = br.find_element_by_id("ctl00_ContentPlaceHolder1_GridViewTitle")
            # Only the second table row is read; the first is presumably the
            # header row -- TODO confirm against the page markup.
            case = case_table.find_elements_by_xpath("tbody/tr")[1].find_elements_by_xpath("td")
            file_date = case[2].text.strip()
            nature_of_proceeding = case[5].text.strip()
            court_no = case[1].text.strip()
            status = case[4].text.strip()

            print("Info CN:", case_number)
            print("-------------------------------------------------------------------------------------")
            print(case_number, "|", file_date, "|", nature_of_proceeding, "|", court_no, "|", status)
            print("-------------------------------------------------------------------------------------")
            pbtcases.write(
                "|".join((case_number, file_date, nature_of_proceeding, court_no, status)) + '\n')

            # The last cell of the case row is handed over as the parties link.
            br = get_parties(case[-1], br, pbtparties)
            events_link = br.find_element_by_xpath(
                '//table[@id="ctl00_ContentPlaceHolder1_GridViewTitle"]/tbody/tr/td')
            br = get_events(events_link, br, pbtevents)
    return br
def get_parties(parties_link, br, pbtparties):
    """Open the parties view of a case and append one line per party.

    Args:
        parties_link: Element whose child ``a`` link opens the parties view.
        br: WebDriver used to locate the parties table after the click.
        pbtparties: Writable text file object receiving pipe-delimited rows
            (case number, role, party name/addresses, attorney
            name/addresses/phone).

    Returns:
        ``br``. When the link or the parties table is missing, "No parties"
        is printed and nothing is written.
    """
    try:
        parties_link.find_element_by_xpath("a").click()
    except NoSuchElementException:
        print("No parties")
        return br
    try:
        parties_table = br.find_element_by_id("ctl00_ContentPlaceHolder1_GridViewParties2")
    except NoSuchElementException:
        print("No parties")
        return br

    # The first table row is skipped; slicing an empty list is harmless.
    rows = parties_table.find_elements_by_xpath("tbody/tr")[1:]
    print("----====Parties====----")
    for i, row in enumerate(rows):
        cells = row.find_elements_by_xpath("td")
        # Label ids carry the row number zero-padded to two digits
        # (ctl02 ... ctl09, ctl10, ...). Concatenating "ctl0" + number would
        # yield "ctl010" from the ninth party on and miss the element.
        id_prefix = "ctl00_ContentPlaceHolder1_GridViewParties2_ctl%02d_" % (i + 2)

        case_number = cells[0].text.strip()
        partyrole = cells[1].text.strip()
        pname, paddr, paddr2 = [
            cells[2].find_element_by_id(id_prefix + label).text
            for label in ("lblPName", "lblPAddr", "lblPAddr2")
        ]
        aname, aaddr, aaddr2, attyphone = [
            cells[3].find_element_by_id(id_prefix + label).text
            for label in ("lblAName", "lblAAddr", "lblAAddr2", "lblAttyPhone")
        ]

        print("------------------------------------------------------------------------------------------------------")
        print(case_number, "|", partyrole, "|", pname, "|", paddr, "|", paddr2, "|", aname, "|", aaddr, "|",
              aaddr2, "|", attyphone)
        print("------------------------------------------------------------------------------------------------------")
        pbtparties.write("|".join((case_number, partyrole, pname, paddr, paddr2,
                                   aname, aaddr, aaddr2, attyphone)) + "\n")
    return br
def get_events(events_link, br, pbtevents):
    """Open the events view of a case and append one line per event.

    Args:
        events_link: Element whose child ``a`` link opens the events view.
        br: WebDriver used to locate the events table after the click.
        pbtevents: Writable text file object receiving pipe-delimited rows
            (case number, document id, file date, description, comments,
            number of pages).

    Returns:
        ``br``. Returns early, writing nothing, when the link or the
        ``itemPlaceholderContainer`` element is missing.
    """
    try:
        events_link.find_element_by_xpath("a").click()
    except NoSuchElementException:
        return br

    try:
        container = br.find_element_by_id("itemPlaceholderContainer")
    except NoSuchElementException:
        return br

    event_rows = container.find_elements_by_xpath("tbody/tr")
    print("----====Events====----")
    separator = "------------------------------------------------------------------------------------------------------"
    for event_row in event_rows:
        cells = event_row.find_elements_by_xpath("td")
        # Cells 4 and 5 hold links; the remaining cells hold spans.
        case_number = cells[0].find_element_by_xpath("span").text
        document_id = cells[5].find_element_by_xpath("a").text
        file_date = cells[1].find_element_by_xpath("span").text
        event_desc = cells[2].find_element_by_xpath("span").text
        comments = cells[3].find_element_by_xpath("span").text
        number_of_page = cells[4].find_element_by_xpath("a").text

        print(separator)
        print(case_number, "|", document_id, "|", file_date, "|", event_desc, "|", comments, "|", number_of_page)
        print(separator)

        record = (case_number, document_id, file_date, event_desc, comments, number_of_page)
        pbtevents.write("|".join(record) + "\n")
    return br
def get_page(url):
    """Scrape all cases filed on the newest date the search page offers.

    Starts a PhantomJS browser, loads ``url``, reads the "max date" label,
    searches for that single day, collects the case numbers from the result
    rows and passes them to ``get_case_table_info``.

    The browser is always shut down with ``quit()`` in a ``finally`` block,
    so the driver process does not outlive an error.

    Args:
        url: Address of the probate search page.

    Raises:
        SystemExit: With status 1 when the browser cannot be started, or
            when the max-date label cannot be found (a screenshot is saved
            to ``error.png`` in the latter case).
    """
    br = start_browser_phjs()
    if not br:
        # The starter reports failure by returning False.
        print("Could not start the browser")
        sys.exit(1)

    max_date_id = "ctl00_ContentPlaceHolder1_lblMaxDate"
    try:
        br.get(url)
        try:
            br.wait.until(EC.presence_of_element_located((By.ID, max_date_id)))
        except TimeoutException:
            # Fall through: the direct lookup below gets one more chance.
            pass
        try:
            last_date_available_string = br.find_element_by_id(max_date_id).text
        except NoSuchElementException:
            br.get_screenshot_as_file("error.png")
            sys.exit(1)

        last_date_available = re.search(r"[\d]{4}-[\d]{2}-[\d]{2}", str(last_date_available_string))
        if last_date_available:
            date = datetime.datetime.strptime(last_date_available.group(), "%Y-%m-%d").date()
            # NOTE(review): two-digit year; presumably the form accepts
            # mm/dd/yy as well as mm/dd/yyyy -- confirm.
            d = date.strftime("%m/%d/%y")
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtDateFrom").send_keys(d)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_txtDateTo").send_keys(d)
            br.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearchN").click()
            # Fixed pause, presumably to let the result list render.
            time.sleep(5)
            table = br.find_element_by_id("itemPlaceholderContainer")
            # NOTE(review): the leading "//" makes this XPath search the whole
            # document rather than only `table` -- confirm that is intended.
            rows = table.find_elements_by_xpath(
                '//tbody/tr[contains(@class, "even") or contains(@class, "odd")]')
            case_numbers = [row.find_element_by_xpath("td/a").text for row in rows]
            br = get_case_table_info(case_numbers, br)
    finally:
        br.quit()
def main():
    """Run the scraper against the Harris County Clerk probate search page."""
    get_page("http://www.cclerk.hctx.net/applications/websearch/Probate.aspx")
# Run the scraper only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.