Skip to content

Instantly share code, notes, and snippets.

@windweller
Last active September 2, 2020 00:57
Show Gist options
  • Save windweller/2e352613ff917ad4cf45034fc8313e41 to your computer and use it in GitHub Desktop.
Save windweller/2e352613ff917ad4cf45034fc8313e41 to your computer and use it in GitHub Desktop.
A very simple Geochecker!
import urllib.request
import csv
from collections import defaultdict
import nltk
class GeoExtractor(object):
    """Guess which US state(s) a piece of text refers to.

    On construction the extractor downloads a CSV of US geography and
    builds four lookup tables that map zip codes, state names and
    abbreviations, county names and city names to full state names.

    Matching is done on the individual tokens produced by
    ``nltk.word_tokenize``, so a name is only recognised when it shows up
    in the text as a single token.
    """

    def __init__(self):
        """Download the geography data and precompute the match sets."""
        self.zipcode_to_state = {}
        self.statenames_to_state = {}
        # A county or city name can exist in several states, hence sets.
        self.countynames_to_states = defaultdict(set)
        self.citynames_to_states = defaultdict(set)

        self.initialize_database()

        self.zipcode_match_set = set(self.zipcode_to_state)
        self.statename_match_set = set(self.statenames_to_state)
        self.countyname_match_set = set(self.countynames_to_states)
        # Built from the *city* table; building it from the county table
        # made city matching silently behave like county matching.
        self.cityname_match_set = set(self.citynames_to_states)

    def initialize_database(self):
        """Fetch the geography CSV over HTTP and load it into the tables.

        Raises:
            urllib.error.URLError: if the download fails.
            ValueError: if a non-blank data row does not have six columns.
        """
        geo_data_url = "https://github.com/scpike/us-state-county-zip/raw/master/geo-data.csv"
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(geo_data_url) as response:
            data = response.read()  # a `bytes` object
        self._load_csv_text(data.decode('utf-8'))

    def _load_csv_text(self, text):
        """Parse CSV ``text`` (header row first) into the lookup tables.

        Each data row is expected to hold six columns; the first one is
        ignored and the rest are read as state name, state abbreviation,
        zip code, county and city.

        Raises:
            ValueError: if a non-blank row does not have exactly six columns.
        """
        # csv.reader copes with quoted fields that contain commas, which a
        # plain str.split(',') would cut in the wrong place.
        reader = csv.reader(text.splitlines())
        next(reader, None)  # skip the header row
        for row in reader:
            # A trailing newline in the download yields an empty row.
            if not any(field.strip() for field in row):
                continue
            if len(row) != 6:
                raise ValueError(
                    "expected 6 columns on CSV line %d, got %d: %r"
                    % (reader.line_num, len(row), row)
                )
            _, state_name, state_abbr, zipcode, county, city = row
            self.zipcode_to_state[zipcode] = state_name
            # Both the full name and the abbreviation resolve to the full name.
            self.statenames_to_state[state_name] = state_name
            self.statenames_to_state[state_abbr] = state_name
            self.countynames_to_states[county].add(state_name)
            self.citynames_to_states[city].add(state_name)

    def check_overlap(self, keyword_match_set, text_set, state_dic):
        """Find which known keywords occur in the text and their states.

        Args:
            keyword_match_set: set of keywords to look for.
            text_set: the tokens of the text being examined.
            state_dic: mapping from keyword to either a single state name
                or a set of state names.

        Returns:
            A ``(found, state_names)`` tuple: ``found`` is True when at
            least one keyword occurs in ``text_set`` and ``state_names``
            is the set of states those keywords map to.
        """
        overlap = keyword_match_set.intersection(text_set)
        state_names = set()
        for keyword in overlap:
            states = state_dic[keyword]
            if isinstance(states, (set, frozenset)):
                state_names.update(states)
            else:
                state_names.add(states)
        return bool(overlap), state_names

    def extract_state_from_text(self, text):
        """Return the list of state names that ``text`` appears to mention.

        Simple priority matching is used:

        1. Zip codes and state names/abbreviations are checked first; if
           either matches, the states from both are returned.
        2. Otherwise county names are checked and their states returned.
        3. Otherwise the states of any matching city names are returned.

        City and county matches are only a fallback because those names
        are a lot messier than zip codes and state names.

        Returns:
            A list of state names (empty when nothing matches). Names are
            sorted within each priority group so the result is
            deterministic.
        """
        tokens = nltk.word_tokenize(text)
        text_set = set(tokens)

        state_proposals = []

        zip_check, state_names = self.check_overlap(
            self.zipcode_match_set, text_set, self.zipcode_to_state)
        state_proposals.extend(sorted(state_names))

        statename_check, state_names = self.check_overlap(
            self.statename_match_set, text_set, self.statenames_to_state)
        state_proposals.extend(sorted(state_names))

        if zip_check or statename_check:
            return state_proposals

        county_check, state_names = self.check_overlap(
            self.countyname_match_set, text_set, self.countynames_to_states)
        state_proposals.extend(sorted(state_names))

        if county_check:
            return state_proposals

        _, state_names = self.check_overlap(
            self.cityname_match_set, text_set, self.citynames_to_states)
        # Return a list here too, so every path has the same return type.
        return sorted(state_names)
# Demo run: constructing the extractor loads the geography tables, then a
# sample sentence containing a zip code and a state abbreviation is resolved
# and the proposed states are printed.
geoe = GeoExtractor()
print(geoe.extract_state_from_text(text="I live in 94506, TX"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment