import os
import json
import pickle
import re
import requests
import time
from datetime import date, timedelta
from os.path import join
from pathlib import Path
from multiprocessing import Pool
import numpy as np
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from fuzzywuzzy import process
from scrape_csv import scrape_accela
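
# Daily scrape of the City of Grand Rapids' Accela Citizen Access portal:
# pull new code-enforcement/housing cases, match them to parcel centroids,
# refresh the status of open cases, and write GeoJSON/JSON outputs for mapping.
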
def open_closed(status):
    # a case counts as open unless its status contains 'Closed'
    return 'Closed' not in status

def update_open(case_num):
    # re-check a case's current status on Accela Citizen Access
    url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={case_num}'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(case_num)
    try:
        status = soup.find(id='ctl00_PlaceHolderMain_lblRecordStatus').text
        return status
    except Exception as e:
        # record-status label missing; fall back to the search-results table
        print(e)
        table = soup.find(id='ctl00_PlaceHolderMain_CapView_gdvPermitList')
        tr = table.find_all('tr')
        for row in tr:
            cells = row.find_all(['td'], recursive=False)
            if len(cells) > 1:
                if cells[1].text.strip() == case_num:
                    print(cells[-1].text.strip())
                    return cells[-1].text.strip()

def filter_case_type(df):
    types = ["Backyard Chicken Complaint", "Building Complaint",
             "Commercial Building Maintenance Complaint", "Dead Tree on Private Property",
             "House Demolition", "Housing Blight", "Housing Complaint",
             "Housing Multi-Family Rental Certification", "Housing Single-Family Rental",
             "Housing Two-Family Rental", "Inoperable Vehicle on Private Property",
             "Refuse Complaint", "Sidewalk Snow Removal Complaint",
             "Tall Grass or Trash in Yard", "Zoning Complaint"]
    df = df[df['Case Type'].isin(types)]
    df = df[~df['Case Number'].str.contains('TMP')]
    return df

def rearrange(r):
    # move the street direction (second token) to the end of the address
    splt = r.split(' ')
    direc = splt[1]
    splt.remove(direc)
    splt.append(direc)
    return ' '.join(splt)

def st_name(r):
    d = {'1ST': 'FIRST', ' 2ND': ' SECOND', ' 3RD': ' THIRD', ' 4TH': ' FOURTH',
         '5TH': 'FIFTH', ' 6TH': ' SIXTH', '7TH': 'SEVENTH', ' 8TH': ' EIGHTH',
         '9TH': 'NINTH', '10TH': 'TENTH', '11TH': 'ELEVENTH', '12TH': 'TWELFTH'}
    for k, v in d.items():
        r = r.replace(k, v)
    return r

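# format_search builds the Accela global-search URL for an address; raw addresses
# from the CSV may still carry city/state/ZIP and sometimes a unit number.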
def format_search(r):
    if ',' not in r:
        addr_split = r.strip().split()
        address_search = '+'.join(addr_split)
        request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
        return (r, request_url)
    row = r.replace(',', '')
    row = row.replace('United States', '')
    addr_split = row.strip().split()
    # combine longer ZIP codes with a dash (ZIP+4)
    if len(addr_split[-1]) == 9:
        addr_split[-1] = addr_split[-1][:5] + '-' + addr_split[-1][5:]
    # drop the unit number, which sits between the street suffix and 'GRAND' (Rapids)
    sfx_match = [i for i in addr_split if i in suffixes]
    if sfx_match:
        sfx_index = addr_split.index(sfx_match[0])
        if addr_split[sfx_index + 1].upper() != 'GRAND':
            del addr_split[sfx_index + 1]
    address_search = '+'.join(addr_split)
    request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
    return (r, request_url)

def get_pnum(r):
    # request the search URL, retrying once after a short pause on failure
    try:
        page = requests.get(r)
        return (r, page)
    except Exception as e:
        print('-----------ERROR------------')
        print(e)
        time.sleep(5)
        try:
            page = requests.get(r)
            return (r, page)
        except Exception as e:
            print('2nd error')
            print(e)
            return np.nan

def get_text(r):
    # pull any parcel numbers out of the response body
    search = parcel_regex.findall(r.text)
    parcel = list(set(search))
    return parcel

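# wrappers intended for Pool.map: apply get_pnum / update_open across a chunk of a pandas Series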
def multi_pnum(series): return series.apply(lambda r: get_pnum(r))
def multi_update_open(series): return series.apply(lambda r: update_open(r))
def format_df(df):
    # drop case types we don't want
    print('Formatting dataframe')
    df = filter_case_type(df)
    df.Address = df.Address.apply(lambda r: r.split(',')[0].strip())  # keep only the street portion, not GR and ZIP info
    df.Address = df.Address.apply(lambda r: rearrange(r))  # put the street direction at the end of the string
    df.Address = df.Address.apply(lambda r: st_name(r))  # change 1ST to FIRST, etc.
    return df

def start_multiprocess(addresses):
    start = time.time()
    # split the addresses by the number of cores to use for the process
    num_cores = 10  # computer has 12
    parcels_split = np.array_split(addresses, num_cores)
    # scrape Grand Rapids' Accela Citizen Access in parallel
    with Pool(processes=num_cores) as pool:
        data_list = pool.map(multi_pnum, parcels_split)
    end = time.time()
    print(end - start)
    return data_list

def extract_pnum(r):
    # only trust the scrape if exactly one parcel number was found
    if r:
        if len(r) == 1:
            return r[0]

def miss_match(df, gdf):
    # find matches
    print('Finding parcel number matches and splitting between parcels that do & do not have matches')
    merge = df.merge(gdf[['Address', 'PNUM']], on='Address', how='left')
    merge = merge.drop_duplicates(subset=['Case Number'])
    missing = merge.loc[merge.PNUM.isna()]
    matching = merge.loc[~merge.PNUM.isna()]
    return (missing, matching)

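# merge_queries joins the parcel numbers scraped from Accela back onto the rows
# that had no match in the parcel layer, then restores the original column names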
def merge_queries(res, query, missing):
    query_merge = query.merge(res[['pnum', 'request_url']], on='request_url', how='left')
    query_merge.columns = ['Address_y', 'request_url', 'pnum']
    missing_merge = missing.merge(query_merge[['Address_y', 'pnum']], on='Address_y', how='left')
    missing_merge.PNUM = missing_merge.pnum.apply(lambda r: extract_pnum(r))
    missing_merge.drop(['Address_y', 'pnum'], axis=1, inplace=True)
    missing_merge.columns = ['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status', 'PNUM']
    return missing_merge

def main():
    csv_file = scrape_accela(last_run, yesterday_str)
    master = pd.read_csv('scrape_final.csv')
    raw_df = pd.read_csv(csv_file, usecols=['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status'])
    df = format_df(raw_df)
    if df.empty:
        return None
    gdf = gpd.read_file(r"parcels\centroid\parcel_points.geojson")
    missing, matching = miss_match(df, gdf)
    if missing.empty:
        merge = pd.concat([matching, master])
    else:
        # revert addresses to the old format where there was no parcel number match, for better searching
        print('Reverting addresses to old format where there was no match for parcel numbers')
        missing = missing.merge(raw_df[['Case Number', 'Address']], on='Case Number', how='left')
        missing_list = missing.Address_y.drop_duplicates(keep='first')
        print('Making column for URL query')
        queries = missing_list.apply(lambda r: format_search(r))
        query_df = pd.DataFrame([i for i in queries], columns=['address', 'request_url'])
        print('Making requests to Accela to find missing parcel numbers')
        start = time.time()
        # res = start_multiprocess(query_df.request_url)
        res = query_df.request_url.apply(lambda r: get_pnum(r))
        end = time.time()
        print(end - start)
        # res_df = pd.DataFrame([j for i in res for j in i], columns=['request_url', 'response'])
        res_df = pd.DataFrame([i for i in res], columns=['request_url', 'response'])
        # extract parcel numbers from the response text
        res_df['pnum'] = res_df.response.apply(lambda r: get_text(r))
        print('Populating parcel numbers from the new dataframe')
        missing_merge = merge_queries(res_df, query_df, missing)
        # put the split dataframes back together
        print('Concatenating the dataframes that were split earlier')
        final_df = pd.concat([missing_merge, matching])
        final_df = final_df.dropna(subset=['PNUM'])
        # merge with master
        merge = pd.concat([master, final_df])
    # keep the most recent duplicate
    merge.drop_duplicates(subset=['Case Number'], keep='last', inplace=True)
    merge['Open'] = merge.Status.apply(lambda r: open_closed(r))
    # update open cases (copies avoid pandas SettingWithCopyWarning)
    cases_open = merge.loc[merge.Open == True].copy()
    cases_close = merge.loc[merge.Open == False].copy()
    cases_open['Status'] = cases_open['Case Number'].apply(lambda r: update_open(r))
    cases_open['Open'] = cases_open.Status.apply(lambda r: open_closed(r))
    merge = pd.concat([cases_close, cases_open])
    merge.to_csv('scrape_final.csv', index=False)
    # merge with parcels
    cases = gdf.merge(merge, on='PNUM')
    cases = cases[['Case Number', 'Case Type', 'Date', 'Description', 'Status', 'Open', 'geometry']]
    cases.to_file('cases.geojson', driver='GeoJSON')
    cases_stripped = cases[['Case Type', 'Status', 'Open', 'Date', 'geometry']]
    cases_stripped.to_file('cases_stripped.geojson', driver='GeoJSON')
    # get unique status choices per case type
    unique_types = cases['Case Type'].unique()
    choice_dict = {}
    for i in unique_types:
        df = cases.loc[cases['Case Type'] == i]
        unique_status = list(df.Status.unique())
        choice_dict[i] = unique_status
    with open('choices.json', 'w') as f:
        json.dump(choice_dict, f)

if __name__ == '__main__':
    # parcel numbers look like 41-##-##-###-###
    parcel_regex = re.compile(r'41-\d{2}-\d{2}-\d{3}-\d{3}')
    # street suffix abbreviations used by format_search to spot unit numbers in addresses
    suffixes = ['ALY', 'ANX', 'ARC', 'AVE', 'YU', 'BCH', 'BND', 'BLF', 'BTM', 'BLVD', 'BR', 'BRG', 'BRK', 'BG', 'BYP',
                'CP', 'CYN', 'CPE', 'CSWY', 'CTR', 'CIR', 'CLFS', 'CLB', 'COR', 'CORS', 'CRSE', 'CT', 'CTS', 'CV',
                'CRK', 'CRES', 'XING', 'DL', 'DM', 'DV', 'DR', 'EST', 'EXPY', 'EXT', 'FALL', 'FLS', 'FRY', 'FLD',
                'FLDS', 'FLT', 'FOR', 'FRST', 'FGR', 'FORK', 'FRKS', 'FT', 'FWY', 'GDNS', 'GTWY', 'GLN', 'GN', 'GRV',
                'HBR', 'HVN', 'HTS', 'HWY', 'HL', 'HLS', 'HOLW', 'INLT', 'IS', 'ISS', 'ISLE', 'JCT', 'CY', 'KNLS', 'LK',
                'LKS', 'LNDG', 'LN', 'LGT', 'LF', 'LCKS', 'LDG', 'LOOP', 'MALL', 'MNR', 'MDWS', 'ML', 'MLS', 'MSN', 'MT',
                'MTN', 'NCK', 'ORCH', 'OVAL', 'PARK', 'PKY', 'PASS', 'PATH', 'PIKE', 'PNES', 'PL', 'PLN', 'PLNS', 'PLZ',
                'PT', 'PRT', 'PR', 'RADL', 'RNCH', 'RPDS', 'RST', 'RDG', 'RIV', 'RD', 'ROW', 'RUN', 'SHL', 'SHLS', 'SHR',
                'SHRS', 'SPG', 'SPGS', 'SPUR', 'SQ', 'STA', 'STRA', 'STRM', 'ST', 'SMT', 'TER', 'TRCE', 'TRAK', 'TRL',
                'TRLR', 'TUNL', 'TPKE', 'UN', 'VLY', 'VIA', 'VW', 'VLG', 'VL', 'VIS', 'WALK', 'WAY', 'WLS']
    # scrape window: from the last successful run through yesterday
    with open('last-run.pkl', 'rb') as f:
        last_run = pickle.load(f)
    yesterday = date.today() - timedelta(days=1)
    yesterday_str = date.strftime(yesterday, '%m%d%Y')
    main()
    # remember today's date as the starting point for the next run
    with open('last-run.pkl', 'wb') as f:
        today_str = date.strftime(date.today(), '%m%d%Y')
        pickle.dump(today_str, f)