-
-
Save pbaylis/0a31fa51108a1ab9f464 to your computer and use it in GitHub Desktop.
Unclaimed property scraping code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### gets table from https://ucpi.sco.ca.gov/ucp/... | |
### outputs csv | |
### There are two types (that I know of so far): Property and Notice | |
### - Property: https://ucpi.sco.ca.gov/ucp/PropertyDetails.aspx?propertyRecID=8162422 | |
### - Notice: https://ucpi.sco.ca.gov/ucp/NoticeDetails.aspx?propertyRecID=2033380 | |
# takes required arguments N/P, min, max. ex: | |
# - python scraping.py N 1 1000 | |
# - python scraping.py P 500 2000 | |
import sys | |
import csv | |
import time | |
import urllib2 | |
import datetime | |
from bs4 import BeautifulSoup | |
from geopy import geocoders | |
def main(): | |
if len(sys.argv) < 4: | |
print("Insufficient arguments. Proper example: \"python scraping.py P 500 2000\"") | |
sys.exit() | |
print("Starting...") | |
# get required parameters from call | |
NParg = sys.argv[1] | |
start = int(sys.argv[2]) | |
end = int(sys.argv[3]) | |
sleep = 0 # how much to wait in between calls | |
if NParg == "N": | |
getNotices(start, end, sleep) | |
if NParg == "P": | |
getProperties(start, end, sleep) | |
print("Done!") | |
def getProperties(start, end, sleep = 0): | |
print("Getting Properties...") | |
propertyList = [["propID", "ownerName", "ownerAdd", "propType", "cashRep", "sharesRep", "nameSec", "repBy", "newAdd", "lat", "lng", "dateRetrieved", "URL"]] | |
todaysdate = datetime.datetime.now().strftime("%Y-%m-%d") | |
lastGood = start | |
for num in range(start,end+1): | |
# 4/28/2014 - search on propertyID, not propertyRecID. Not dynamic. | |
# https://ucpi.sco.ca.gov/ucp/PropertyDetails.aspx?propertyID=001331061 | |
# 9 digit numbers, needs to be padded with 0s | |
url = "https://ucpi.sco.ca.gov/ucp/PropertyDetails.aspx?propertyID=" + str(num).zfill(9) | |
print(str(num - start + 1) + "/" + str(end - start + 1) + ": Querying " + url) | |
try: | |
property = processProperty(url) + [todaysdate] + [url] | |
propertyList.append(property) | |
lastGood = num # if this worked, save it as the last Good | |
except Exception: | |
print("Error processing " + str(url) + "!") | |
pass | |
if num > lastGood + 100: # if there were 100 straight errors | |
print("100 straight errors. Quitting now.") | |
break | |
time.sleep(sleep) | |
outputCSV(propertyList,"properties_" + str(start) + "_" + str(lastGood)) | |
def processProperty(requestURL): | |
response = urllib2.urlopen(requestURL) | |
responseHTML = response.read() | |
soup = BeautifulSoup(responseHTML) | |
# get property ID number | |
propID = soup.find('table', id="tbl_HeaderInformation").findAll('span')[2].contents[0].encode('ascii', 'ignore').strip() | |
PropertyDetailsTable = soup.find('table', id="PropertyDetailsTable") | |
PropertyDetailsList = getListFromTable(PropertyDetailsTable) | |
# They add two fields (sharesRep and nameSec) if shares were reported | |
if PropertyDetailsTable.find('tr', id="ctl00_ContentPlaceHolder1_trNumberOfShares") is None: | |
PropertyDetailsList.insert(4, "") | |
PropertyDetailsList.insert(4, "") | |
# get latlon and better address from address | |
geogList = getGeog(PropertyDetailsList[1]) | |
propertyRow = [propID] + PropertyDetailsList + geogList | |
return(propertyRow) | |
def getListFromTable(table): | |
# processes a table where we want the second column of every row, returns list | |
rows = table.findAll('tr') | |
# get the second column of each row - that's our data | |
outputList = [] | |
for row in rows: | |
col = row.findAll('td')[1].contents | |
if isinstance(col, list): # if it's a list, fix to string | |
col = fixList(col) | |
else: | |
col = col[0].encode('ascii', 'ignore') # otherwise, just get string | |
outputList.append(col.strip()) | |
return(outputList) | |
def getNotices(start, end, sleep = 0): # processes Notices from start to end, saves CSV | |
print("Getting Notices...") | |
noticeList = [["bizContact", "propType", "cashRep", "sharesRep", "nameSec", "dateRep", "dateCont", "ownerName", "ownerAdd", "dateRetrieved", "URL"]] | |
todaysdate = datetime.datetime.now().strftime("%Y-%m-%d") | |
for num in range(start,end+1): | |
url = "https://ucpi.sco.ca.gov/ucp/NoticeDetails.aspx?propertyRecID=" + str(num) | |
print(str(num - start + 1) + "/" + str(end - start + 1) + ": Querying " + url) | |
try: | |
notice = processNotice(url) + [todaysdate] + [url] | |
noticeList.append(notice) | |
lastGood = num # if this worked, save it as the last Good pull | |
except AttributeError: | |
print("Error processing " + str(url) + "!") | |
pass | |
if num > lastGood + 1000: # if there were 1000 straight errors | |
print("1000 straight errors. Quitting now.") | |
break | |
time.sleep(sleep) | |
outputCSV(noticeList,"notices_" + str(start) + "_" + str(lastGood)) | |
def processNotice(requestURL): # given a URL, retrieves a single webpage and returns a list | |
response = urllib2.urlopen(requestURL) | |
responseHTML = response.read() | |
soup = BeautifulSoup(responseHTML) | |
# process the holder name | |
Holder = fixList(soup.find('td', id="HolderNameData").contents) | |
# process the property details table | |
PropertyDetailsTable = soup.find('table', id="PropertyDetailsTable") | |
PropertyDetailsRows = PropertyDetailsTable.findAll('tr') | |
# get the second column of each row - that's our data | |
PropertyDetailsList = [] | |
for row in PropertyDetailsRows: | |
col = row.findAll('td')[1].contents | |
if isinstance(col, list): # if it's a list, fix to string | |
col = fixList(col) | |
else: | |
col = col[0].encode('ascii', 'ignore') # otherwise, just get string | |
PropertyDetailsList.append(col.strip()) | |
noticeRow = [Holder] + PropertyDetailsList | |
return(noticeRow) | |
def getGeog(address): | |
# take in an address, return the address, (lat, lon) in a list | |
us = geocoders.GeocoderDotUS() | |
try: | |
place, (lat, lng) = us.geocode(address) | |
except TypeError: | |
print "Couldn't geocode address." | |
place, (lat, lng) = "", (0,0) | |
return list((place, lat, lng)) | |
def fixList(tagstrList): | |
# take in a list that includes strings and tags, return property formatted string and ignore tags | |
fixed = "" | |
for part in tagstrList: | |
if isinstance(part, unicode): | |
fixed = fixed + part.strip().encode('ascii', 'ignore') + "\n" | |
return(fixed.strip()) | |
def outputCSV(mylist, name): | |
writer=csv.writer(file("data/" + name + '.csv','wb'),dialect='excel') | |
writer.writerows(mylist) | |
print("Wrote " + name + '.csv') | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment