Skip to content

Instantly share code, notes, and snippets.

@Thibauth
Created June 25, 2015 05:45
Show Gist options
  • Save Thibauth/dd900c6ddf31855fb2a7 to your computer and use it in GitHub Desktop.
Save Thibauth/dd900c6ddf31855fb2a7 to your computer and use it in GitHub Desktop.
from Levenshtein import setratio
import argparse
import re
import sys
def cluster(strings, threshold=0.8):
    """Greedily group similar strings together.

    Each string is normalized (surrounding whitespace stripped, lower-cased,
    every run of characters outside ``a-z0-9`` replaced by a single space)
    and split into whitespace-separated tokens.  The token list is compared,
    with ``Levenshtein.setratio``, against the tokens of every existing
    cluster key; the string joins the first cluster whose score is strictly
    greater than ``threshold``.  If none matches, the string's normalized
    form becomes the key of a new cluster.

    Args:
        strings: iterable of raw strings to group.
        threshold: minimum (exclusive) ``setratio`` score required to join
            an existing cluster.

    Returns:
        A dict mapping each normalized cluster key to the set of original
        (un-normalized) strings assigned to that cluster.
    """
    non_alpha = re.compile(r"[^a-z0-9]+")
    clusters = {}
    # Token list of every cluster key, cached so a key is split only once
    # instead of once per comparison.
    key_tokens = {}
    for string in strings:
        nzd_string = non_alpha.sub(" ", string.strip().lower()).strip()
        # Loop-invariant for the comparisons below, so compute it up front.
        tokens = nzd_string.split()
        for key in clusters:
            if setratio(key_tokens[key], tokens) > threshold:
                clusters[key].add(string)
                break
        else:
            # No existing cluster was similar enough.  setdefault (rather
            # than plain assignment) keeps the members already stored under
            # this key when the strict ``>`` test rejected an identical
            # normalized string, e.g. with ``threshold >= 1``.
            clusters.setdefault(nzd_string, set()).add(string)
            key_tokens[nzd_string] = tokens
    return clusters
def output_clusters(clusters, fo):
    """Write clusters to ``fo`` in the text format read by ``read_clusters``.

    For every cluster, in sorted key order, one unindented line holding the
    cluster's representative string is written, followed by one
    tab-indented line per member of the cluster (the representative is
    itself one of the members).

    Args:
        clusters: dict mapping a cluster key to a collection of member
            strings, as returned by ``cluster``.
        fo: writable text file-like object.
    """
    # sorted(dict) iterates the keys and works on both Python 2 and 3,
    # unlike dict.iterkeys(), which no longer exists on Python 3.
    for key in sorted(clusters):
        # Sort the members so the chosen representative and the output are
        # reproducible; set iteration order is arbitrary.
        members = sorted(clusters[key])
        if not members:
            # Nothing to write for an empty cluster (and no representative).
            continue
        fo.write(members[0] + "\n")
        fo.write("\t" + "\n\t".join(members) + "\n")
def read_clusters(fo):
    """Parse a cluster file into a member -> representative mapping.

    The expected format is the one written by ``output_clusters``: a line
    that does not start with a tab names a cluster, and every following
    line that starts with a tab is a member of that cluster (the text after
    the first tab).

    Args:
        fo: iterable of text lines, e.g. an open file.

    Returns:
        A dict mapping each member string to the name of its cluster.

    Raises:
        ValueError: if a member line appears before any cluster name.
    """
    m = {}
    current = None
    for line in fo:
        stripped = line.rstrip("\n")
        # startswith (not stripped[0]) so that an empty line -- the name of
        # a cluster whose representative is the empty string -- is accepted
        # as a header instead of raising IndexError.
        if stripped.startswith("\t"):
            if current is None:
                raise ValueError(
                    "cluster member %r appears before any cluster name"
                    % stripped[1:])
            m[stripped[1:]] = current
        else:
            current = stripped
    return m
def apply_map(m, strings):
    """Lazily translate each item of ``strings`` through the mapping ``m``.

    Yields ``m[item]`` for every item, in input order.  An item missing
    from ``m`` raises ``KeyError`` at the moment it is reached.
    """
    for item in strings:
        canonical = m[item]
        yield canonical
def normalize_command(args):
    """Handler for the ``normalize`` sub-command.

    Loads the member -> representative mapping from ``args.cluster_file``,
    then writes to ``args.output`` the mapped value of every line of
    ``args.infile`` (trailing newline removed), one per line.
    """
    mapping = read_clusters(args.cluster_file)
    # Generator: input lines are consumed one at a time as they are mapped.
    raw_names = (entry.rstrip("\n") for entry in args.infile)
    for canonical in apply_map(mapping, raw_names):
        args.output.write(canonical + "\n")
def cluster_command(args):
    """Handler for the ``cluster`` sub-command.

    Clusters the lines of ``args.infile`` (trailing newline removed) using
    ``args.threshold`` and writes the resulting clusters to
    ``args.outfile``.
    """
    entries = (raw.rstrip("\n") for raw in args.infile)
    grouped = cluster(entries, args.threshold)
    output_clusters(grouped, args.outfile)
def main(argv=None):
    """Command-line entry point.

    Sub-commands:
        cluster [-t THRESHOLD] [infile] [outfile]
            Group the lines of ``infile`` (default: stdin) and write the
            clusters to ``outfile`` (default: stdout).
        normalize [-o OUTPUT] cluster_file [infile]
            Map every line of ``infile`` (default: stdin) through the
            clusters in ``cluster_file`` and write the result to ``OUTPUT``
            (default: stdout).

    Args:
        argv: argument list to parse; ``None`` lets argparse use the
            process's command-line arguments.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command")
    # Sub-commands are optional by default on Python 3; without this,
    # running the script with no sub-command would reach ``args.func`` and
    # fail with AttributeError instead of a usage message.
    subparsers.required = True

    cluster_parser = subparsers.add_parser("cluster")
    normalize_parser = subparsers.add_parser("normalize")

    cluster_parser.add_argument("--threshold", "-t", type=float, default=0.8)
    cluster_parser.add_argument("infile", nargs="?",
                                type=argparse.FileType('r'),
                                default=sys.stdin)
    cluster_parser.add_argument("outfile", nargs="?", default=sys.stdout,
                                type=argparse.FileType('w'))
    cluster_parser.set_defaults(func=cluster_command)

    normalize_parser.add_argument("-o", "--output", default=sys.stdout,
                                  type=argparse.FileType('w'))
    normalize_parser.add_argument("cluster_file",
                                  type=argparse.FileType('r'))
    normalize_parser.add_argument("infile", nargs="?", default=sys.stdin,
                                  type=argparse.FileType('r'))
    normalize_parser.set_defaults(func=normalize_command)

    args = parser.parse_args(argv)
    # Each sub-parser stored its handler under ``func`` via set_defaults.
    args.func(args)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment