Skip to content

Instantly share code, notes, and snippets.

View rhhernandes's full-sized avatar

Raphael Hernandes rhhernandes

View GitHub Profile
import re
from unidecode import unidecode
def fingerprint(string):
# change all characters to their lowercase representation
string = string.lower()
# remove all punctuation and control characters
string = re.sub("[^A-Za-z0-9 ]+", "", string)
# normalize extended western characters to their ASCII representation
@Irio
Irio / normalize_string.py
Created December 12, 2017 15:53
normalize_string.py
import unicodedata
def normalize_string(string):
    """Return a lowercase, ASCII-only version of ``string``.

    The text is lowercased and decomposed with Unicode NFKD normalization,
    which splits accented letters into a base letter plus combining marks.
    Encoding to ASCII with ``errors='ignore'`` then drops every character
    that has no ASCII representation (including those combining marks).

    Args:
        string: The value to normalize.

    Returns:
        The normalized ``str``, or ``None`` when ``string`` is not a ``str``.
    """
    if not isinstance(string, str):
        # Non-text input is passed over rather than raising.
        return None
    nfkd_form = unicodedata.normalize('NFKD', string.lower())
    # The bytes are pure ASCII at this point, so decode with the same codec.
    return nfkd_form.encode('ascii', 'ignore').decode('ascii')
@avalanchy
avalanchy / upper.py
Created September 28, 2016 12:21
Convert all dict keys to uppercase in Python.
def _uppercase_for_dict_keys(lower_dict):
upper_dict = {}
for k, v in lower_dict.items():
if isinstance(v, dict):
v = _uppercase_for_dict_keys(v)
upper_dict[k.upper()] = v
return upper_dict
@phillipsm
phillipsm / gist:0ed98b2585f0ada5a769
Last active November 25, 2022 14:02
Example of parsing a table using BeautifulSoup and requests in Python
import requests
from bs4 import BeautifulSoup
# We've now imported the two packages that will do the heavy lifting
# for us, requests and BeautifulSoup
# Let's put the URL of the page we want to scrape in a variable
# so that our code down below can be a little cleaner
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/'
@cjdd3b
cjdd3b / fingerprint.py
Created February 22, 2015 14:17
Python implementation of Google Refine fingerprinting algorithms here: https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
# -*- coding: utf-8 -*-
import re, string
from unidecode import unidecode
PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
class Fingerprinter(object):
'''
Python implementation of Google Refine fingerprinting algorithm described here:
@alopes
alopes / stopwords.txt
Created April 10, 2013 20:32
Portuguese stop words
de
a
o
que
e
do
da
em
um
para