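# Daily scrape of Grand Rapids code-enforcement cases from Accela Citizen
# Access: pull new records (scrape_csv.scrape_accela), match each case
# address to a parcel number, re-check the status of open cases, then write
# scrape_final.csv, cases.geojson, cases_stripped.geojson, and choices.json.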
import os
import json
import pickle
import re
import requests
import time
from datetime import date, timedelta
from os.path import join
from pathlib import Path
from multiprocessing import Pool
import numpy as np
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from fuzzywuzzy import process
from scrape_csv import scrape_accela
def open_closed(status):
    # a case counts as open unless its status string contains 'Closed'
    return 'Closed' not in status
def update_open(case_num):
    # re-check a case's current status on Accela Citizen Access
    url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={case_num}'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(case_num)
    try:
        # a unique case number goes straight to the record page
        status = soup.find(id='ctl00_PlaceHolderMain_lblRecordStatus').text
        return status
    except Exception as e:
        print(e)
        # otherwise the search lands on a results table; find the matching row
        # and return the status from its last cell
        table = soup.find(id='ctl00_PlaceHolderMain_CapView_gdvPermitList')
        tr = table.find_all('tr')
        for row in tr:
            cells = row.find_all(['td'], recursive=False)
            if len(cells) > 1:
                if cells[1].text.strip() == case_num:
                    print(cells[-1].text.strip())
                    return cells[-1].text.strip()
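# Example usage (case number format is illustrative):
#   update_open('CE-21-01234')  # returns the status string shown on the page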
def filter_case_type(df):
    # keep only the case types we want
    types = ["Backyard Chicken Complaint", "Building Complaint",
             "Commercial Building Maintenance Complaint", "Dead Tree on Private Property",
             "House Demolition", "Housing Blight", "Housing Complaint",
             "Housing Multi-Family Rental Certification", "Housing Single-Family Rental",
             "Housing Two-Family Rental", "Inoperable Vehicle on Private Property",
             "Refuse Complaint", "Sidewalk Snow Removal Complaint",
             "Tall Grass or Trash in Yard", "Zoning Complaint"]
    df = df[df['Case Type'].isin(types)]
    # drop temporary case numbers
    df = df[~df['Case Number'].str.contains('TMP')]
    return df
def rearrange(r):
    # move the street direction (second token, e.g. 'NE') to the end of the
    # string: '1100 NE MONROE AVE' -> '1100 MONROE AVE NE'
    splt = r.split(' ')
    direc = splt[1]
    splt.remove(direc)
    splt.append(direc)
    return ' '.join(splt)
def st_name(r):
    # spell out numbered street names; the leading space (consistent across
    # all keys here) keeps '1ST' from matching inside a longer ordinal like
    # '21ST', and is safe because addresses begin with a house number
    d = {' 1ST': ' FIRST', ' 2ND': ' SECOND', ' 3RD': ' THIRD', ' 4TH': ' FOURTH',
         ' 5TH': ' FIFTH', ' 6TH': ' SIXTH', ' 7TH': ' SEVENTH', ' 8TH': ' EIGHTH',
         ' 9TH': ' NINTH', ' 10TH': ' TENTH', ' 11TH': ' ELEVENTH', ' 12TH': ' TWELFTH'}
    for k, v in d.items():
        r = r.replace(k, v)
    return r
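# Example (address is illustrative):
#   st_name('1100 2ND ST NW') -> '1100 SECOND ST NW'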
def format_search(r):
    # build the Accela global-search URL for an address
    if ',' not in r:
        # already in short form: just join the tokens with '+'
        addr_split = r.strip().split()
        address_search = '+'.join(addr_split)
        request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
        return (r, request_url)
    row = r.replace(',', '')
    row = row.replace('United States', '')
    addr_split = row.strip().split()
    # insert the dash into ZIP+4 codes (e.g. 495030000 -> 49503-0000)
    if len(addr_split[-1]) == 9:
        addr_split[-1] = addr_split[-1][:5] + '-' + addr_split[-1][5:]
    # drop the unit number: in this raw format the direction precedes the
    # street name, so the token after the street suffix is either a unit
    # number or 'GRAND' (from 'GRAND RAPIDS')
    sfx_match = [i for i in addr_split if i in suffixes]
    if sfx_match:
        sfx_index = addr_split.index(sfx_match[0])
        if sfx_index + 1 < len(addr_split) and addr_split[sfx_index + 1].upper() != 'GRAND':
            del addr_split[sfx_index + 1]
    address_search = '+'.join(addr_split)
    request_url = f'https://inspections.grcity.us/CitizenAccess/Cap/GlobalSearchResults.aspx?QueryText={address_search}'
    return (r, request_url)
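# Example (address is illustrative):
#   format_search('123 NE LEONARD ST 2, GRAND RAPIDS, MI 495030000')
#   returns the original string plus a search URL whose query text is
#   '123+NE+LEONARD+ST+GRAND+RAPIDS+MI+49503-0000' (unit '2' dropped,
#   ZIP+4 dashed).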
def get_pnum(r):
    # fetch a search URL, retrying once after a short pause
    try:
        page = requests.get(r)
        return (r, page)
    except Exception as e:
        print('-----------ERROR------------')
        print(e)
        time.sleep(5)
        try:
            page = requests.get(r)
            return (r, page)
        except Exception as e:
            print('2nd error')
            print(e)
            # keep the two-element (url, response) shape so the results still
            # load cleanly into the dataframe built downstream
            return (r, np.nan)
def get_text(r):
    # pull the unique parcel numbers out of a response body; parcel_regex is
    # defined in the __main__ block
    if pd.isna(r):
        return []
    search = parcel_regex.findall(r.text)
    parcel = list(set(search))
    return parcel
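# parcel_regex (defined under __main__) matches parcel numbers of the form
# 41-##-##-###-###, e.g. '41-13-25-126-008' (illustrative value).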
def multi_pnum(series):
    return series.apply(lambda r: get_pnum(r))

def multi_update_open(series):
    return series.apply(lambda r: update_open(r))
def format_df(df):
    # drop rows not in the case types we want
    print('Formatting dataframe')
    df = filter_case_type(df)
    df.Address = df.Address.apply(lambda r: r.split(',')[0].strip())  # street only, not the GR and ZIP info
    df.Address = df.Address.apply(lambda r: rearrange(r))  # put street direction at the end of the string
    df.Address = df.Address.apply(lambda r: st_name(r))  # change 1ST to FIRST, etc.
    return df
def start_multiprocess(addresses):
    start = time.time()
    # split the addresses by the number of cores to use for the process
    num_cores = 10  # computer has 12
    parcels_split = np.array_split(addresses, num_cores)
    # scrape Grand Rapids' Accela Citizen Access in parallel
    with Pool(processes=num_cores) as pool:
        data_list = pool.map(multi_pnum, parcels_split)
    end = time.time()
    print(end - start)
    return data_list
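# Design note: each worker applies get_pnum serially over its chunk (via
# multi_pnum), so the pool distributes num_cores chunks rather than one task
# per URL. main() currently runs the requests serially and leaves this
# multiprocessing path commented out.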
def extract_pnum(r):
    # keep a parcel number only when the page matched exactly one; empty or
    # ambiguous results fall through to None and are dropped later
    if r:
        if len(r) == 1:
            return r[0]
def miss_match(df, gdf):
    # find matches
    print('Finding parcel number matches and splitting cases that do and do not have matches')
    merge = df.merge(gdf[['Address', 'PNUM']], on='Address', how='left')
    merge = merge.drop_duplicates(subset=['Case Number'])
    missing = merge.loc[merge.PNUM.isna()]
    matching = merge.loc[~merge.PNUM.isna()]
    return (missing, matching)
def merge_queries(res, query, missing):
    query_merge = query.merge(res[['pnum', 'request_url']], on='request_url', how='left')
    query_merge.columns = ['Address_y', 'request_url', 'pnum']
    missing_merge = missing.merge(query_merge[['Address_y', 'pnum']], on='Address_y', how='left')
    missing_merge.PNUM = missing_merge.pnum.apply(lambda r: extract_pnum(r))
    missing_merge.drop(['Address_y', 'pnum'], axis=1, inplace=True)
    missing_merge.columns = ['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status', 'PNUM']
    return missing_merge
def main():
    csv_file = scrape_accela(last_run, yesterday_str)
    master = pd.read_csv('scrape_final.csv')
    raw_df = pd.read_csv(csv_file, usecols=['Date', 'Case Number', 'Case Type', 'Address', 'Description', 'Status'])
    df = format_df(raw_df)
    if df.empty:
        return None
    gdf = gpd.read_file(r"parcels\centroid\parcel_points.geojson")
    missing, matching = miss_match(df, gdf)
    if missing.empty:
        # master first so the keep='last' dedup below prefers the fresh scrape
        merge = pd.concat([master, matching])
    else:
        # revert the unmatched rows to their original addresses for better searching
        print('Reverting addresses to old format where there was no match for parcel numbers')
        missing = missing.merge(raw_df[['Case Number', 'Address']], on='Case Number', how='left')
        missing_list = missing.Address_y.drop_duplicates(keep='first')
        print('Making column for URL query')
        queries = missing_list.apply(lambda r: format_search(r))
        query_df = pd.DataFrame([i for i in queries], columns=['address', 'request_url'])
        print('Making requests to Accela to find missing parcel numbers')
        start = time.time()
        # res = start_multiprocess(query_df.request_url)
        res = query_df.request_url.apply(lambda r: get_pnum(r))
        end = time.time()
        print(end - start)
        # res_df = pd.DataFrame([j for i in res for j in i], columns=['request_url', 'response'])
        res_df = pd.DataFrame([i for i in res], columns=['request_url', 'response'])
        # extract parcel numbers from the response text
        res_df['pnum'] = res_df.response.apply(lambda r: get_text(r))
        print('Populating parcel numbers from the new dataframe')
        missing_merge = merge_queries(res_df, query_df, missing)
        # put the split dataframes back together
        print('Concatenating the dataframes that were split earlier')
        final_df = pd.concat([missing_merge, matching])
        final_df = final_df.dropna(subset=['PNUM'])
        # merge with master
        merge = pd.concat([master, final_df])
    # keep the most recent duplicate
    merge.drop_duplicates(subset=['Case Number'], keep='last', inplace=True)
    merge['Open'] = merge.Status.apply(lambda r: open_closed(r))
    # re-check open cases against the live site; .copy() avoids assigning
    # into a view of merge (pandas SettingWithCopyWarning)
    cases_open = merge.loc[merge.Open == True].copy()
    cases_close = merge.loc[merge.Open == False].copy()
    cases_open['Status'] = cases_open['Case Number'].apply(lambda r: update_open(r))
    cases_open['Open'] = cases_open.Status.apply(lambda r: open_closed(r))
    merge = pd.concat([cases_close, cases_open])
    merge.to_csv('scrape_final.csv', index=False)
    # join the cases onto the parcel centroids
    cases = gdf.merge(merge, on='PNUM')
    cases = cases[['Case Number', 'Case Type', 'Date', 'Description', 'Status', 'Open', 'geometry']]
    cases.to_file('cases.geojson', driver='GeoJSON')
    cases_stripped = cases[['Case Type', 'Status', 'Open', 'Date', 'geometry']]
    cases_stripped.to_file('cases_stripped.geojson', driver='GeoJSON')
    # record the unique statuses seen for each case type
    unique_types = cases['Case Type'].unique()
    choice_dict = {}
    for i in unique_types:
        df = cases.loc[cases['Case Type'] == i]
        unique_status = list(df.Status.unique())
        choice_dict[i] = unique_status
    with open('choices.json', 'w') as f:
        json.dump(choice_dict, f)
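# choices.json maps each case type to the status strings seen for it, e.g.
# (values illustrative):
#   {"Housing Complaint": ["Open", "Closed"],
#    "Refuse Complaint": ["Open"]}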
if __name__ == '__main__':
    parcel_regex = re.compile(r'41-\d{2}-\d{2}-\d{3}-\d{3}')
    # street-suffix abbreviations used to locate the suffix token in an address
    suffixes = ['ALY', 'ANX', 'ARC', 'AVE', 'YU', 'BCH', 'BND', 'BLF', 'BTM', 'BLVD', 'BR', 'BRG', 'BRK', 'BG', 'BYP',
                'CP', 'CYN', 'CPE', 'CSWY', 'CTR', 'CIR', 'CLFS', 'CLB', 'COR', 'CORS', 'CRSE', 'CT', 'CTS', 'CV',
                'CRK', 'CRES', 'XING', 'DL', 'DM', 'DV', 'DR', 'EST', 'EXPY', 'EXT', 'FALL', 'FLS', 'FRY', 'FLD',
                'FLDS', 'FLT', 'FOR', 'FRST', 'FGR', 'FORK', 'FRKS', 'FT', 'FWY', 'GDNS', 'GTWY', 'GLN', 'GN', 'GRV',
                'HBR', 'HVN', 'HTS', 'HWY', 'HL', 'HLS', 'HOLW', 'INLT', 'IS', 'ISS', 'ISLE', 'JCT', 'CY', 'KNLS', 'LK',
                'LKS', 'LNDG', 'LN', 'LGT', 'LF', 'LCKS', 'LDG', 'LOOP', 'MALL', 'MNR', 'MDWS', 'ML', 'MLS', 'MSN', 'MT',
                'MTN', 'NCK', 'ORCH', 'OVAL', 'PARK', 'PKY', 'PASS', 'PATH', 'PIKE', 'PNES', 'PL', 'PLN', 'PLNS', 'PLZ',
                'PT', 'PRT', 'PR', 'RADL', 'RNCH', 'RPDS', 'RST', 'RDG', 'RIV', 'RD', 'ROW', 'RUN', 'SHL', 'SHLS', 'SHR',
                'SHRS', 'SPG', 'SPGS', 'SPUR', 'SQ', 'STA', 'STRA', 'STRM', 'ST', 'SMT', 'TER', 'TRCE', 'TRAK', 'TRL',
                'TRLR', 'TUNL', 'TPKE', 'UN', 'VLY', 'VIA', 'VW', 'VLG', 'VL', 'VIS', 'WALK', 'WAY', 'WLS']
    with open('last-run.pkl', 'rb') as f:
        last_run = pickle.load(f)
    yesterday = date.today() - timedelta(days=1)
    yesterday_str = date.strftime(yesterday, '%m%d%Y')
    main()
    # remember today's date as the start of the next run's scrape window
    with open('last-run.pkl', 'wb') as f:
        today_str = date.strftime(date.today(), '%m%d%Y')
        pickle.dump(today_str, f)