Skip to content

Instantly share code, notes, and snippets.

@jaklinger
Created April 26, 2018 08:31
Show Gist options
  • Save jaklinger/b09713b2e3884b0db056b213870ef1fa to your computer and use it in GitHub Desktop.
Save jaklinger/b09713b2e3884b0db056b213870ef1fa to your computer and use it in GitHub Desktop.
Example of matching organisations to Companies House, by name and address, including fuzzy matching
# Note superfuzz can be found here: https://github.com/jaklinger/nesta_toolbox/blob/master/sandbox/jaklinger/superfuzz/superfuzz.py
import random
import re
import time
from collections import Counter

import requests
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_process
from retrying import retry
from superfuzz.superfuzz import superfuzz
# Gather global stop words together: the ten most frequent words across all
# company names are treated as noise when comparing names.
# NOTE(review): `ch_df` is not defined in this file; it must already be in
# scope and expose a "CompanyName" column of strings (presumably a pandas
# DataFrame of Companies House data -- confirm against the caller).
words = [word
         for company_name in ch_df["CompanyName"].values
         for word in company_name.split()]
stops = [w.lower() for w, _ in Counter(words).most_common(10)]

# Matches any run of non-word characters and/or underscores. A raw string is
# used so that `\W` reaches the regex engine instead of being parsed as an
# (invalid) string escape.
pattern = re.compile(r'[\W_]+')

# Service endpoints and credentials.
ch_url = "https://api.companieshouse.gov.uk/search/companies"
postcodes_url = "https://api.postcodes.io/postcodes/"
ch_api_keys = ['GET_A_KEY',]
def destop(value, stopwords=None):
    """Return *value* lower-cased, with stop words removed.

    The string is split on whitespace, every token found in the stop-word
    collection is dropped, and the remaining tokens are re-joined with single
    spaces.

    Args:
        value: The string to clean.
        stopwords: Optional collection of lower-case words to drop. Defaults
            to the module-level ``stops`` list.

    Returns:
        The cleaned, lower-case string (possibly empty).
    """
    if stopwords is None:
        stopwords = stops
    # Lower-case once up front; the stop words themselves are lower-case.
    return " ".join(token for token in value.lower().split()
                    if token not in stopwords)
@retry(stop_max_attempt_number=5, wait_fixed=5000)
def query_ch(q):
    """Run a Companies House company search for *q* and return the JSON body.

    The call is retried on any exception: at most 5 attempts, with a fixed
    5000 ms wait between them (see the ``retry`` decorator arguments).

    Args:
        q: The free-text search query.

    Returns:
        The decoded JSON response; callers in this file read its ``"items"``
        list.

    Raises:
        requests.exceptions.RequestException: If the final attempt fails,
            including HTTP error statuses (via ``raise_for_status``) and
            timeouts.
    """
    params = dict(items_per_page=20, q=q)
    # One of the configured keys is picked at random per request; it is sent
    # as the basic-auth user name with an empty password.
    auth = (random.choice(ch_api_keys), '')
    # An explicit timeout stops a stalled connection from hanging forever;
    # a timeout raises, which lets the retry decorator try again.
    r = requests.get(url=ch_url, auth=auth, params=params, timeout=30)
    r.raise_for_status()
    return r.json()
def get_lon_lat(postcode):
    """Look up the coordinates of *postcode* from the postcodes service.

    Args:
        postcode: The postcode string, appended to ``postcodes_url``.

    Returns:
        A ``(longitude, latitude)`` tuple taken from the response's
        ``"result"`` object.

    Raises:
        requests.exceptions.HTTPError: If the service answers with an error
            status.
        requests.exceptions.Timeout: If the service does not answer in time.
    """
    # Explicit timeout so a stalled connection cannot block indefinitely.
    r = requests.get(url=postcodes_url + postcode, timeout=30)
    r.raise_for_status()
    result = r.json()["result"]
    lat = result["latitude"]
    lon = result["longitude"]
    return lon, lat
def _normalise_postcode(postcode):
    """Return *postcode* upper-cased with all whitespace removed.

    Non-string values are returned unchanged so that they still compare by
    plain equality.
    """
    if isinstance(postcode, str):
        return "".join(postcode.split()).upper()
    return postcode


def match_by_postcode(postcode, query_result):
    """Return the first search result whose postcode equals *postcode*.

    Postcodes are compared ignoring case and whitespace, so ``"ab12cd"``
    matches ``"AB1 2CD"``.

    Args:
        postcode: The postcode to look for, or ``None``.
        query_result: A search response dict with an ``"items"`` list. Items
            lacking an ``"address"``, with a ``None`` address, or with no
            ``"postal_code"`` in the address are skipped.

    Returns:
        The first matching item, or ``None`` if *postcode* is ``None`` or
        nothing matches.
    """
    if postcode is None:
        return None
    target = _normalise_postcode(postcode)
    for item in query_result["items"]:
        address = item.get("address")
        if address is None or "postal_code" not in address:
            continue
        if _normalise_postcode(address["postal_code"]) == target:
            return item
    return None
def _first_name_match(items, clean_name, min_score):
    """Return the first item whose de-stopped title scores above *min_score*.

    Items are visited in the order given, so the search service's own ranking
    is trusted once a score is good enough. Returns ``None`` if no item
    qualifies.
    """
    for item in items:
        score = superfuzz(clean_name, destop(item['title']),
                          [fuzz.partial_ratio, fuzz.token_sort_ratio])
        if score > min_score:
            return item
    return None


def get_match(names_addresses, threshold=80, threshold_address=90):
    """Find the Companies House search result that best fits one organisation.

    Matching proceeds in stages and returns at the first success:

    1. Search by name. If any result has the organisation's postcode, return
       it.
    2. Otherwise return the first result of that search whose title fuzzily
       matches the name with a score above *threshold*.
    3. If an address is available, search by name plus address. Return the
       first result whose title scores above *threshold_address*.
    4. Otherwise return the first result of that search whose address snippet
       scores above *threshold* against the organisation's address.

    Args:
        names_addresses: A single record (despite the plural name): a mapping
            with a ``"name"`` string and an ``"address"`` that is either
            ``None`` or a dict of address fields, optionally including
            ``"postCode"``.
        threshold: Minimum score (exclusive) for name matches in the
            name-only search and for address-snippet matches.
        threshold_address: Minimum score (exclusive) for name matches in the
            name-plus-address search.

    Returns:
        The matching search-result item, or ``None`` if nothing matched.
    """
    name = names_addresses["name"]
    address = names_addresses["address"]
    _name = destop(name)

    # Extract the postcode, then flatten the address dict into one string.
    postcode = None
    if address is not None:
        postcode = address.get("postCode")
        # Only string fields contribute; "Unknown" placeholders and missing
        # (None / non-string) values are dropped rather than breaking join().
        address = " ".join(v for v in address.values()
                           if isinstance(v, str) and v != "Unknown")
        address = pattern.sub(' ', address)

    # Query CH by name
    r = query_ch(name)
    items = r['items']
    if items:
        # An exact postcode hit is the strongest evidence, so try it first
        match = match_by_postcode(postcode, r)
        if match is not None:
            return match
        # Otherwise fall back to a fuzzy comparison of the names
        match = _first_name_match(items, _name, threshold)
        if match is not None:
            return match

    # Query CH by name and address
    if address is not None:
        items = query_ch(name + " " + address)["items"]
        # First try a name match
        match = _first_name_match(items, _name, threshold_address)
        if match is not None:
            return match
        # Otherwise try an address match
        for item in items:
            snippet = item.get('address_snippet')
            if snippet is None:
                continue
            _address = pattern.sub(' ', snippet)
            score = superfuzz(address, _address,
                              [fuzz.partial_ratio, fuzz.token_sort_ratio])
            if score > threshold:
                return item

    # No result
    return None
if __name__ == "__main__":
    # Records to match. get_match reads a "name" key and an "address" key
    # (a dict of address fields, or None) from each entry.
    names_addresses = []
    # Maps organisation name --> matched search result (or None).
    results = {}
    for i, data in enumerate(names_addresses):
        name = data["name"]
        # Each distinct name is only looked up once
        if name in results:
            continue
        try:
            result = get_match(data)
        except Exception:
            # Back off for ten minutes, then try exactly once more. A second
            # failure propagates, and nothing is recorded for this name.
            print("Sleeping on",i,"of",len(names_addresses))
            time.sleep(600)
            result = get_match(data)
        # Stored only after a successful call, so a failed retry can never
        # record the previous iteration's result under this name.
        results[name] = result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment