This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from unidecode import unidecode | |
def fingerprint(string): | |
# change all characters to their lowercase representation | |
string = string.lower() | |
# remove all punctuation and control characters | |
string = re.sub("[^A-Za-z0-9 ]+", "", string) | |
# normalize extended western characters to their ASCII representation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata


def normalize_string(string):
    """Return a lowercase, ASCII-only version of ``string``.

    The text is lowercased and decomposed with Unicode NFKD normalization,
    so accented letters split into a base letter plus combining marks.
    Encoding to ASCII with ``errors='ignore'`` then drops the combining
    marks, along with any other character that has no ASCII form.

    Args:
        string: The value to normalize.

    Returns:
        The normalized ``str``, or ``None`` when ``string`` is not a ``str``.
    """
    if not isinstance(string, str):
        # Non-string input yields None, matching the original fall-through.
        return None
    decomposed = unicodedata.normalize('NFKD', string.lower())
    # The bytes are pure ASCII at this point, so decode with the same codec.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _uppercase_for_dict_keys(lower_dict): | |
upper_dict = {} | |
for k, v in lower_dict.items(): | |
if isinstance(v, dict): | |
v = _uppercase_for_dict_keys(v) | |
upper_dict[k.upper()] = v | |
return upper_dict |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
# We've now imported the two packages that will do the heavy lifting | |
# for us, requests and BeautifulSoup
# Let's put the URL of the page we want to scrape in a variable | |
# so that our code down below can be a little cleaner | |
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re, string | |
from unidecode import unidecode | |
PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation)) | |
class Fingerprinter(object): | |
''' | |
Python implementation of Google Refine fingerprinting algorithm described here: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
de | |
a | |
o | |
que | |
e | |
do | |
da | |
em | |
um | |
para |