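# Daily scrape of Grand Rapids code-enforcement cases from Accela Citizen
# Access: pull new records (scrape_csv.scrape_accela), match each case
# address to a parcel number, re-check the status of open cases, then write
# scrape_final.csv, cases.geojson, cases_stripped.geojson, and choices.json.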
import os
import json
import pickle
import re
import requests
import time
from datetime import date, timedelta
from os.path import join
from pathlib import Path
from multiprocessing import Pool
import numpy as np
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from fuzzywuzzy import process
from scrape_csv import scrape_accela
def open_closed(status):
    # a case counts as open unless its status string contains 'Closed'
    return 'Closed' not in status
def update_open(case_num):
    # re-check a case's current status on Accela Citizen Access
    url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={case_num}'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(case_num)
    try:
        # a unique case number goes straight to the record page
        status = soup.find(id='ctl00_PlaceHolderMain_lblRecordStatus').text
        return status
    except Exception as e:
        print(e)
        # otherwise the search lands on a results table; find the matching row
        # and return the status from its last cell
        table = soup.find(id='ctl00_PlaceHolderMain_CapView_gdvPermitList')
        tr = table.find_all('tr')
        for row in tr:
            cells = row.find_all(['td'], recursive=False)
            if len(cells) > 1:
                if cells[1].text.strip() == case_num:
                    print(cells[-1].text.strip())
                    return cells[-1].text.strip()
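# Example usage (case number format is illustrative):
#   update_open('CE-21-01234')  # returns the status string shown on the page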
def filter_case_type(df):
    # keep only the case types we want
    types = ["Backyard Chicken Complaint", "Building Complaint",
             "Commercial Building Maintenance Complaint", "Dead Tree on Private Property",
             "House Demolition", "Housing Blight", "Housing Complaint",
             "Housing Multi-Family Rental Certification", "Housing Single-Family Rental",
             "Housing Two-Family Rental", "Inoperable Vehicle on Private Property",
             "Refuse Complaint", "Sidewalk Snow Removal Complaint",
             "Tall Grass or Trash in Yard", "Zoning Complaint"]
    df = df[df['Case Type'].isin(types)]
    # drop temporary case numbers
    df = df[~df['Case Number'].str.contains('TMP')]
    return df
def rearrange(r):
    # move the street direction (second token, e.g. 'NE') to the end of the
    # string: '1100 NE MONROE AVE' -> '1100 MONROE AVE NE'
    splt = r.split(' ')
    direc = splt[1]
    splt.remove(direc)
    splt.append(direc)
    return ' '.join(splt)
def st_name(r):
    # spell out numbered street names; the leading space (consistent across
    # all keys here) keeps '1ST' from matching inside a longer ordinal like
    # '21ST', and is safe because addresses begin with a house number
    d = {' 1ST': ' FIRST', ' 2ND': ' SECOND', ' 3RD': ' THIRD', ' 4TH': ' FOURTH',
         ' 5TH': ' FIFTH', ' 6TH': ' SIXTH', ' 7TH': ' SEVENTH', ' 8TH': ' EIGHTH',
         ' 9TH': ' NINTH', ' 10TH': ' TENTH', ' 11TH': ' ELEVENTH', ' 12TH': ' TWELFTH'}
    for k, v in d.items():
        r = r.replace(k, v)
    return r
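# Example (address is illustrative):
#   st_name('1100 2ND ST NW') -> '1100 SECOND ST NW'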
def format_search(r):
    # build the Accela global-search URL for an address
    if ',' not in r:
        # already in short form: just join the tokens with '+'
        addr_split = r.strip().split()
        address_search = '+'.join(addr_split)
        request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
        return (r, request_url)
    row = r.replace(',', '')
    row = row.replace('United States', '')
    addr_split = row.strip().split()
    # insert the dash into ZIP+4 codes (e.g. 495030000 -> 49503-0000)
    if len(addr_split[-1]) == 9:
        addr_split[-1] = addr_split[-1][:5] + '-' + addr_split[-1][5:]
    # drop the unit number: in this raw format the direction precedes the
    # street name, so the token after the street suffix is either a unit
    # number or 'GRAND' (from 'GRAND RAPIDS')
    sfx_match = [i for i in addr_split if i in suffixes]
    if sfx_match:
        sfx_index = addr_split.index(sfx_match[0])
        if sfx_index + 1 < len(addr_split) and addr_split[sfx_index + 1].upper() != 'GRAND':
            del addr_split[sfx_index + 1]
    address_search = '+'.join(addr_split)
    request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
    return (r, request_url)
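# Example (address is illustrative):
#   format_search('123 NE LEONARD ST 2, GRAND RAPIDS, MI 495030000')
#   returns the original string plus a search URL whose query text is
#   '123+NE+LEONARD+ST+GRAND+RAPIDS+MI+49503-0000' (unit '2' dropped,
#   ZIP+4 dashed).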
def get_pnum(r):
    # fetch a search URL, retrying once after a short pause
    try:
        page = requests.get(r)
        return (r, page)
    except Exception as e:
        print('-----------ERROR------------')
        print(e)
        time.sleep(5)
        try:
            page = requests.get(r)
            return (r, page)
        except Exception as e:
            print('2nd error')
            print(e)
            # keep the two-element (url, response) shape so the results still
            # load cleanly into the dataframe built downstream
            return (r, np.nan)
def get_text(r):
    # pull the unique parcel numbers out of a response body; parcel_regex is
    # defined in the __main__ block
    if pd.isna(r):
        return []
    search = parcel_regex.findall(r.text)
    parcel = list(set(search))
    return parcel
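# parcel_regex (defined under __main__) matches parcel numbers of the form
# 41-##-##-###-###, e.g. '41-13-25-126-008' (illustrative value).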
def multi_pnum(series):
    return series.apply(lambda r: get_pnum(r))

def multi_update_open(series):
    return series.apply(lambda r: update_open(r))
def format_df(df):
    # drop rows not in the case types we want
    print('Formatting dataframe')
    df = filter_case_type(df)
    df.Address = df.Address.apply(lambda r: r.split(',')[0].strip())  # street only, not the GR and ZIP info
    df.Address = df.Address.apply(lambda r: rearrange(r))  # put street direction at the end of the string
    df.Address = df.Address.apply(lambda r: st_name(r))  # change 1ST to FIRST, etc.
    return df
def start_multiprocess(addresses):
    start = time.time()
    # split the addresses by the number of cores to use for the process
    num_cores = 10  # computer has 12
    parcels_split = np.array_split(addresses, num_cores)
    # scrape Grand Rapids' Accela Citizen Access in parallel
    with Pool(processes=num_cores) as pool:
        data_list = pool.map(multi_pnum, parcels_split)
    end = time.time()
    print(end - start)
    return data_list
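# Design note: each worker applies get_pnum serially over its chunk (via
# multi_pnum), so the pool distributes num_cores chunks rather than one task
# per URL. main() currently runs the requests serially and leaves this
# multiprocessing path commented out.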
def extract_pnum(r):
    # keep a parcel number only when the page matched exactly one; empty or
    # ambiguous results fall through to None and are dropped later
    if r:
        if len(r) == 1:
            return r[0]
def miss_match(df, gdf):
    # find matches
    print('Finding parcel number matches and splitting cases that do and do not have matches')
    merge = df.merge(gdf[['Address', 'PNUM']], on='Address', how='left')
    merge = merge.drop_duplicates(subset=['Case Number'])
    missing = merge.loc[merge.PNUM.isna()]
    matching = merge.loc[~merge.PNUM.isna()]
    return (missing, matching)
def merge_queries(res, query, missing):
    query_merge = query.merge(res[['pnum', 'request_url']], on='request_url', how='left')
    query_merge.columns = ['Address_y', 'request_url', 'pnum']
    missing_merge = missing.merge(query_merge[['Address_y', 'pnum']], on='Address_y', how='left')
    missing_merge.PNUM = missing_merge.pnum.apply(lambda r: extract_pnum(r))
    missing_merge.drop(['Address_y', 'pnum'], axis=1, inplace=True)
    missing_merge.columns = ['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status', 'PNUM']
    return missing_merge
def main():
    csv_file = scrape_accela(last_run, yesterday_str)
    master = pd.read_csv('scrape_final.csv')
    raw_df = pd.read_csv(csv_file, usecols=['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status'])
    df = format_df(raw_df)
    if df.empty:
        return None
    gdf = gpd.read_file(r"parcels\centroid\parcel_points.geojson")
    missing, matching = miss_match(df, gdf)
    if missing.empty:
        # master first so the keep='last' dedup below prefers the fresh scrape
        merge = pd.concat([master, matching])
    else:
        # revert the unmatched rows to their original addresses for better searching
        print('Reverting addresses to old format where there was no match for parcel numbers')
        missing = missing.merge(raw_df[['Case Number', 'Address']], on='Case Number', how='left')
        missing_list = missing.Address_y.drop_duplicates(keep='first')
        print('Making column for URL query')
        queries = missing_list.apply(lambda r: format_search(r))
        query_df = pd.DataFrame([i for i in queries], columns=['address', 'request_url'])
        print('Making requests to Accela to find missing parcel numbers')
        start = time.time()
        # res = start_multiprocess(query_df.request_url)
        res = query_df.request_url.apply(lambda r: get_pnum(r))
        end = time.time()
        print(end - start)
        # res_df = pd.DataFrame([j for i in res for j in i], columns=['request_url', 'response'])
        res_df = pd.DataFrame([i for i in res], columns=['request_url', 'response'])
        # extract parcel numbers from the response text
        res_df['pnum'] = res_df.response.apply(lambda r: get_text(r))
        print('Populating parcel numbers from the new dataframe')
        missing_merge = merge_queries(res_df, query_df, missing)
        # put the split dataframes back together
        print('Concatenating the dataframes that were split earlier')
        final_df = pd.concat([missing_merge, matching])
        final_df = final_df.dropna(subset=['PNUM'])
        # merge with master
        merge = pd.concat([master, final_df])
    # keep the most recent duplicate
    merge.drop_duplicates(subset=['Case Number'], keep='last', inplace=True)
    merge['Open'] = merge.Status.apply(lambda r: open_closed(r))
    # re-check open cases against the live site; .copy() avoids assigning
    # into a view of merge (pandas SettingWithCopyWarning)
    cases_open = merge.loc[merge.Open == True].copy()
    cases_close = merge.loc[merge.Open == False].copy()
    cases_open['Status'] = cases_open['Case Number'].apply(lambda r: update_open(r))
    cases_open['Open'] = cases_open.Status.apply(lambda r: open_closed(r))
    merge = pd.concat([cases_close, cases_open])
    merge.to_csv('scrape_final.csv', index=False)
    # join the cases onto the parcel centroids
    cases = gdf.merge(merge, on='PNUM')
    cases = cases[['Case Number', 'Case Type', 'Date', 'Description', 'Status', 'Open', 'geometry']]
    cases.to_file('cases.geojson', driver='GeoJSON')
    cases_stripped = cases[['Case Type', 'Status', 'Open', 'Date', 'geometry']]
    cases_stripped.to_file('cases_stripped.geojson', driver='GeoJSON')
    # record the unique statuses seen for each case type
    unique_types = cases['Case Type'].unique()
    choice_dict = {}
    for i in unique_types:
        df = cases.loc[cases['Case Type'] == i]
        unique_status = list(df.Status.unique())
        choice_dict[i] = unique_status
    with open('choices.json', 'w') as f:
        json.dump(choice_dict, f)
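# choices.json maps each case type to the status strings seen for it, e.g.
# (values illustrative):
#   {"Housing Complaint": ["Open", "Closed"],
#    "Refuse Complaint": ["Open"]}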
if __name__ == '__main__':
    parcel_regex = re.compile(r'41-\d{2}-\d{2}-\d{3}-\d{3}')
    # street-suffix abbreviations used to locate the suffix token in an address
    suffixes = ['ALY', 'ANX', 'ARC', 'AVE', 'YU', 'BCH', 'BND', 'BLF', 'BTM', 'BLVD', 'BR', 'BRG', 'BRK', 'BG', 'BYP',
                'CP', 'CYN', 'CPE', 'CSWY', 'CTR', 'CIR', 'CLFS', 'CLB', 'COR', 'CORS', 'CRSE', 'CT', 'CTS', 'CV',
                'CRK', 'CRES', 'XING', 'DL', 'DM', 'DV', 'DR', 'EST', 'EXPY', 'EXT', 'FALL', 'FLS', 'FRY', 'FLD',
                'FLDS', 'FLT', 'FOR', 'FRST', 'FGR', 'FORK', 'FRKS', 'FT', 'FWY', 'GDNS', 'GTWY', 'GLN', 'GN', 'GRV',
                'HBR', 'HVN', 'HTS', 'HWY', 'HL', 'HLS', 'HOLW', 'INLT', 'IS', 'ISS', 'ISLE', 'JCT', 'CY', 'KNLS', 'LK',
                'LKS', 'LNDG', 'LN', 'LGT', 'LF', 'LCKS', 'LDG', 'LOOP', 'MALL', 'MNR', 'MDWS', 'ML', 'MLS', 'MSN', 'MT',
                'MTN', 'NCK', 'ORCH', 'OVAL', 'PARK', 'PKY', 'PASS', 'PATH', 'PIKE', 'PNES', 'PL', 'PLN', 'PLNS', 'PLZ',
                'PT', 'PRT', 'PR', 'RADL', 'RNCH', 'RPDS', 'RST', 'RDG', 'RIV', 'RD', 'ROW', 'RUN', 'SHL', 'SHLS', 'SHR',
                'SHRS', 'SPG', 'SPGS', 'SPUR', 'SQ', 'STA', 'STRA', 'STRM', 'ST', 'SMT', 'TER', 'TRCE', 'TRAK', 'TRL',
                'TRLR', 'TUNL', 'TPKE', 'UN', 'VLY', 'VIA', 'VW', 'VLG', 'VL', 'VIS', 'WALK', 'WAY', 'WLS']
    with open('last-run.pkl', 'rb') as f:
        last_run = pickle.load(f)
    yesterday = date.today() - timedelta(days=1)
    yesterday_str = date.strftime(yesterday, '%m%d%Y')
    main()
    # remember today's date as the start of the next run's scrape window
    with open('last-run.pkl', 'wb') as f:
        today_str = date.strftime(date.today(), '%m%d%Y')
        pickle.dump(today_str, f)