andylolz/compare_pb_gov_uk.py

## compare_pb_gov_uk.py
import csv
import urllib2

from bs4 import BeautifulSoup, UnicodeDammit
import Levenshtein


class compare_pb_gov_uk():
    """Compare GB public bodies listed on gov.uk with publicbodies.org.

    Titles are fetched from both sources, and every gov.uk title is paired
    with the publicbodies.org title that has the highest Levenshtein ratio.
    """

    def __init__(self):
        """Create a comparer; the instance keeps no state."""
        # A docstring-only body: the original method had no body at all,
        # which is a syntax error and prevented the module from importing.

    # fetch list of titles of GB public bodies already in publicbodies.org
    def fetch_pb(self):
        """Return the unique titles found in the publicbodies.org GB CSV.

        Returns:
            A list of titles taken from the ``title`` column, each passed
            through ``UnicodeDammit`` and de-duplicated.
        """
        pb_url = 'https://raw.github.com/okfn/publicbodies/master/data/gb.csv'
        response = urllib2.urlopen(pb_url)
        try:
            # DictReader is lazy, so the rows must be consumed before the
            # response is closed.  Dict keys de-duplicate the titles.
            unique_titles = {
                UnicodeDammit(row['title']).unicode_markup: None
                for row in csv.DictReader(response)
            }
        finally:
            response.close()
        # A real list (not a keys view) so callers can index into it.
        return list(unique_titles)

    # scrape list of titles of GB public bodies defined on gov.uk
    def fetch_gov_uk(self):
        """Return the unique organisation titles scraped from gov.uk.

        Returns:
            A list of the stripped link texts of every
            ``<li class="organisation">`` element that contains a link.
        """
        gov_uk_url = 'https://www.gov.uk/government/organisations'
        response = urllib2.urlopen(gov_uk_url)
        try:
            gov_uk_html = response.read()
        finally:
            response.close()
        gov_uk_soup = BeautifulSoup(gov_uk_html)

        org_lis = gov_uk_soup.find_all('li', class_='organisation')
        # Skip list items without an <a>: ``li.a`` is None for those and
        # would otherwise raise AttributeError for the whole scrape.
        unique_titles = {
            li.a.text.strip(): None
            for li in org_lis
            if li.a is not None
        }
        return list(unique_titles)

    # find best string match in a list
    # return the tuple (best score, needle, best match)
    def best_match(self, needle, haystack):
        """Find the entry of *haystack* most similar to *needle*.

        Args:
            needle: The string to look for.
            haystack: An iterable of candidate strings.

        Returns:
            A tuple ``(best score, needle, best match)``.  When several
            candidates share the best score the first one wins.  When
            *haystack* is empty the result is ``(0.0, needle, None)``.
        """
        # Materialise once so any iterable works and indexing is safe.
        candidates = list(haystack)
        if not candidates:
            # max() of an empty list would raise ValueError; report
            # "no match" instead so compare() can still complete.
            return 0.0, needle, None

        scores = [Levenshtein.ratio(needle, candidate) for candidate in candidates]
        best_pos = scores.index(max(scores))
        return scores[best_pos], needle, candidates[best_pos]

    # for all public bodies listed on gov.uk,
    # find the best match currently on publicbodies.org
    def compare(self):
        """Match every gov.uk title against the publicbodies.org titles.

        Returns:
            A list of ``(best score, gov.uk title, best match)`` tuples,
            sorted in descending order.
        """
        pb_titles = self.fetch_pb()
        gov_uk_titles = self.fetch_gov_uk()

        matches = [self.best_match(title, pb_titles) for title in gov_uk_titles]
        return sorted(matches, reverse=True)
	import csv
	import urllib2

	from bs4 import BeautifulSoup, UnicodeDammit
	import Levenshtein


	class compare_pb_gov_uk():
	    """Compare GB public bodies listed on gov.uk with publicbodies.org.

	    Titles are fetched from both sources, and every gov.uk title is paired
	    with the publicbodies.org title that has the highest Levenshtein ratio.
	    """

	    def __init__(self):
	        """Create a comparer; the instance keeps no state."""
	        # A docstring-only body: the original method had no body at all,
	        # which is a syntax error.

	    # fetch list of titles of GB public bodies already in publicbodies.org
	    def fetch_pb(self):
	        """Return the unique titles found in the publicbodies.org GB CSV.

	        Returns:
	            A list of titles taken from the ``title`` column, each passed
	            through ``UnicodeDammit`` and de-duplicated.
	        """
	        pb_url = 'https://raw.github.com/okfn/publicbodies/master/data/gb.csv'
	        response = urllib2.urlopen(pb_url)
	        try:
	            # DictReader is lazy, so the rows must be consumed before the
	            # response is closed.  Dict keys de-duplicate the titles.
	            unique_titles = {
	                UnicodeDammit(row['title']).unicode_markup: None
	                for row in csv.DictReader(response)
	            }
	        finally:
	            response.close()
	        # A real list (not a keys view) so callers can index into it.
	        return list(unique_titles)

	    # scrape list of titles of GB public bodies defined on gov.uk
	    def fetch_gov_uk(self):
	        """Return the unique organisation titles scraped from gov.uk.

	        Returns:
	            A list of the stripped link texts of every
	            ``<li class="organisation">`` element that contains a link.
	        """
	        gov_uk_url = 'https://www.gov.uk/government/organisations'
	        response = urllib2.urlopen(gov_uk_url)
	        try:
	            gov_uk_html = response.read()
	        finally:
	            response.close()
	        gov_uk_soup = BeautifulSoup(gov_uk_html)

	        org_lis = gov_uk_soup.find_all('li', class_='organisation')
	        # Skip list items without an <a>: ``li.a`` is None for those and
	        # would otherwise raise AttributeError for the whole scrape.
	        unique_titles = {
	            li.a.text.strip(): None
	            for li in org_lis
	            if li.a is not None
	        }
	        return list(unique_titles)

	    # find best string match in a list
	    # return the tuple (best score, needle, best match)
	    def best_match(self, needle, haystack):
	        """Find the entry of *haystack* most similar to *needle*.

	        Args:
	            needle: The string to look for.
	            haystack: An iterable of candidate strings.

	        Returns:
	            A tuple ``(best score, needle, best match)``.  When several
	            candidates share the best score the first one wins.  When
	            *haystack* is empty the result is ``(0.0, needle, None)``.
	        """
	        # Materialise once so any iterable works and indexing is safe.
	        candidates = list(haystack)
	        if not candidates:
	            # max() of an empty list would raise ValueError; report
	            # "no match" instead so compare() can still complete.
	            return 0.0, needle, None

	        scores = [Levenshtein.ratio(needle, candidate) for candidate in candidates]
	        best_pos = scores.index(max(scores))
	        return scores[best_pos], needle, candidates[best_pos]

	    # for all public bodies listed on gov.uk,
	    # find the best match currently on publicbodies.org
	    def compare(self):
	        """Match every gov.uk title against the publicbodies.org titles.

	        Returns:
	            A list of ``(best score, gov.uk title, best match)`` tuples,
	            sorted in descending order.
	        """
	        pb_titles = self.fetch_pb()
	        gov_uk_titles = self.fetch_gov_uk()

	        matches = [self.best_match(title, pb_titles) for title in gov_uk_titles]
	        return sorted(matches, reverse=True)