Skip to content

Instantly share code, notes, and snippets.

@andylolz
Created July 6, 2013 00:19
Show Gist options
  • Save andylolz/5937994 to your computer and use it in GitHub Desktop.
Save andylolz/5937994 to your computer and use it in GitHub Desktop.
For all public bodies listed on gov.uk, find the best match currently on publicbodies.org
import csv
import urllib2
from bs4 import BeautifulSoup, UnicodeDammit
import Levenshtein
class compare_pb_gov_uk(object):
    """Compare gov.uk organisation titles with publicbodies.org titles.

    For every public body listed on gov.uk, find the most similar title
    among the GB public bodies already recorded on publicbodies.org.
    The instance keeps no state; each call to ``compare`` re-downloads
    both lists.
    """

    def __init__(self):
        """Create a comparer. Nothing is stored on the instance."""

    def fetch_pb(self):
        """Fetch the titles of GB public bodies already in publicbodies.org.

        Returns:
            A list of the distinct ``title`` column values from the GB CSV,
            each passed through ``UnicodeDammit``. Order is arbitrary.
        """
        pb_url = 'https://raw.github.com/okfn/publicbodies/master/data/gb.csv'
        response = urllib2.urlopen(pb_url)
        try:
            # The CSV reader pulls rows lazily from the response, so it has
            # to be fully consumed before the connection is closed.
            titles = {
                UnicodeDammit(row['title']).unicode_markup
                for row in csv.DictReader(response)
            }
        finally:
            response.close()
        # Return a real list so callers can index into it.
        return list(titles)

    def fetch_gov_uk(self):
        """Scrape the titles of GB public bodies listed on gov.uk.

        Returns:
            A list of the distinct, whitespace-stripped link texts of every
            ``<li class="organisation">`` element on the organisations
            page. Order is arbitrary.
        """
        gov_uk_url = 'https://www.gov.uk/government/organisations'
        response = urllib2.urlopen(gov_uk_url)
        try:
            gov_uk_html = response.read()
        finally:
            response.close()
        # Name the parser explicitly: without it BeautifulSoup picks
        # whichever parser is installed, which can change the parse tree
        # between machines.
        gov_uk_soup = BeautifulSoup(gov_uk_html, 'html.parser')
        org_lis = gov_uk_soup.find_all('li', class_='organisation')
        return list({li.a.text.strip() for li in org_lis})

    def best_match(self, needle, haystack):
        """Find the string in ``haystack`` most similar to ``needle``.

        Similarity is whatever ``Levenshtein.ratio`` returns; the candidate
        with the highest value wins, and the first such candidate is chosen
        on a tie.

        Args:
            needle: The string to look for.
            haystack: An iterable of candidate strings.

        Returns:
            The tuple ``(best score, needle, best match)``.

        Raises:
            ValueError: If ``haystack`` contains no candidates.
        """
        # Materialise once so any iterable (list, set, dict view,
        # generator) can be scored and then indexed.
        candidates = list(haystack)
        if not candidates:
            raise ValueError('haystack must contain at least one candidate')
        ratios = [Levenshtein.ratio(needle, candidate)
                  for candidate in candidates]
        best_pos = ratios.index(max(ratios))
        return ratios[best_pos], needle, candidates[best_pos]

    def compare(self):
        """Match every gov.uk public body against publicbodies.org.

        Returns:
            A list of ``(best score, gov.uk title, best publicbodies.org
            title)`` tuples, sorted in descending order (best scores
            first).
        """
        pb_titles = self.fetch_pb()
        gov_uk_titles = self.fetch_gov_uk()
        return sorted(
            (self.best_match(title, pb_titles) for title in gov_uk_titles),
            reverse=True,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment