Skip to content

Instantly share code, notes, and snippets.

@jiweiqi
Created December 5, 2021 14:51
Show Gist options
  • Save jiweiqi/c0d2dc853786e6350007e6e56d8a029f to your computer and use it in GitHub Desktop.
Save jiweiqi/c0d2dc853786e6350007e6e56d8a029f to your computer and use it in GitHub Desktop.
find the nearest 2-year and 4-year college based on zipcode
import re
import pgeocode
from tqdm import tqdm
import pandas as pd
import numpy as np
df_college_4 = pd.read_excel('./4-year updated.xlsx',
skiprows=4, usecols='B:I')
df_college_4['zipcode_full'] = '02139'
df_college_4['zipcode'] = '02139'
for i in range(df_college_4.shape[0]):
address = df_college_4.iloc[i]['Address']
postal_code = re.search(r'.*(\d{5}(\-\d{4})?)$', address)
df_college_4.loc[i, 'zipcode_full'] = postal_code.groups()[0]
df_college_4.loc[i, 'zipcode'] = postal_code.groups()[0][0:5]
college_zipcode_4 = df_college_4.zipcode.values
df_college_4.to_excel('./4-year updated zipcode.xlsx')
df_college_2 = pd.read_excel('./2-year updated.xlsx',
skiprows=4, usecols='B:I')
df_college_2['zipcode_full'] = '02139'
df_college_2['zipcode'] = '02139'
for i in range(df_college_2.shape[0]):
address = df_college_2.iloc[i]['Address']
postal_code = re.search(r'.*(\d{5}(\-\d{4})?)$', address)
df_college_2.loc[i, 'zipcode_full'] = postal_code.groups()[0]
df_college_2.loc[i, 'zipcode'] = postal_code.groups()[0][0:5]
college_zipcode_2 = df_college_2.zipcode.values
df_college_2.to_excel('./2-year updated zipcode.xlsx')
# nomi = pgeocode.Nominatim('us')
# nomi.query_postal_code("35294")
# nomi.query_postal_code(["07306", "02139"])
dist = pgeocode.GeoDistance('us')
# dist.query_postal_code("07306", "02139") # km
# dist.query_postal_code(["07306"], ["07306", "02138"])
df_highschool = pd.read_excel('./SAS_data/byf1tsch.xlsx',
usecols=['SCH_ID', 'BYSCHZIP'])
l_zip = df_highschool['BYSCHZIP'].astype(str).str.zfill(5)
df_highschool['dist_4'] = 100000
df_highschool['dist_2'] = 100000
for i in tqdm(range(df_highschool.shape[0])):
zipcode = df_highschool.BYSCHZIP[i]
if zipcode < 0:
print("Nan zip")
continue
zipcode = l_zip[i]
dist_arr = dist.query_postal_code(zipcode, college_zipcode_4)
dist_arr[np.argwhere(np.isnan(dist_arr))] = 100000
df_highschool.dist_4[i] = np.min(dist_arr)
dist_arr = dist.query_postal_code(zipcode, college_zipcode_2)
dist_arr[np.argwhere(np.isnan(dist_arr))] = 100000
df_highschool.dist_2[i] = np.min(dist_arr)
df_highschool.to_excel('./SAS_data/byf1tsch_dist.xlsx')
# df_private = pd.read_csv('private high school zipcode.csv',
# usecols=['PINST', 'PCITY', 'PSTABB', 'PZIP'])
# df_private
# col_PZIP = df_private.PZIP.values
# df_private['dist_college'] = 100000
# df_public = pd.read_csv('./public school zipcode.csv',
# usecols=['SCH_NAME', 'MCITY', 'MSTATE', 'MZIP'])
# df_public.rename(columns={"SCH_NAME": "PINST", "MCITY": "PCITY",
# "MSTATE": "PSTABB", "MZIP": "PZIP"}, inplace=True)
# df_public
# col_MZIP = df_public.PZIP.values
# df_public['dist_college'] = 100000
# df_highschool = pd.concat([df_private, df_public], ignore_index=True)
# df_highschool
# # select columns
# df_survey = pd.read_csv('./bps09derived_datafile.csv',
# usecols=['SCH_NAME', 'MCITY', 'MSTATE', 'MZIP'])
# for i in tqdm(range(df_private.shape[0])):
# dist_arr = dist.query_postal_code(str(col_PZIP[i]), college_zipcode)
# dist_arr[np.argwhere(np.isnan(dist_arr))] = 100000
# df_private[i, 'dist_college'] = dist_arr.min()
# df_private.to_csv('./private high school zipcode dist.csv')
# for i in tqdm(range(df_public.shape[0])):
# dist_arr = dist.query_postal_code(str(col_PZIP[i]), college_zipcode)
# dist_arr[np.argwhere(np.isnan(dist_arr))] = 100000
# df_public[i, 'dist_college'] = dist_arr.min()
# df_public.to_csv('./public school zipcode dist.csv')
# address = 'Administration Bldg Suite 1070, Birmingham, AL 35294'
# postal_code = re.search(r'.*(\d{5}(\-\d{4})?)$', address)
# postal_code.groups()
# import googlemaps
# from datetime import datetime
# gmaps = googlemaps.Client(key='xx')
# Geocoding an address
# geocode_result = gmaps.geocode('Administration Bldg Suite 1070, Birmingham, AL 35294-0110')
# print(geocode_result[0]['address_components'][-1]['short_name'])
# print(geocode_result[0]['geometry']['location'])
# Look up an address with reverse geocoding
# reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))
# print(geocode_result, reverse_geocode_result)
# geocode_result[0]['geometry']['location']['lat']
# Request directions via public transit
# now = datetime.now()
# directions_result = gmaps.directions("Sydney Town Hall",
# "Parramatta, NSW",
# mode="transit",
# departure_time=now)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment