Created
June 25, 2015 05:45
-
-
Save Thibauth/dd900c6ddf31855fb2a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from Levenshtein import setratio | |
import argparse | |
import re | |
import sys | |
def cluster(strings, threshold=0.8):
    """Group similar strings into clusters using token-set similarity.

    Each string is normalized (stripped, lower-cased, every run of
    characters outside ``a-z0-9`` replaced by a single space) and its
    whitespace-separated tokens are compared with ``setratio`` against the
    tokens of each existing cluster key, in dict iteration order.  The
    string joins the first cluster whose ratio is strictly greater than
    *threshold*; if none qualifies it goes into the cluster keyed by its own
    normalized form.

    Args:
        strings: Iterable of raw strings to cluster.
        threshold: Similarity ratio a string must exceed to join a cluster.

    Returns:
        A dict mapping each normalized cluster key to the set of original
        (un-normalized) strings assigned to that cluster.
    """
    non_alpha = re.compile("[^a-z0-9]+")
    clusters = {}
    # Token list of every cluster key, computed once rather than re-split
    # for each comparison.
    key_tokens = {}
    for string in strings:
        nzd_string = non_alpha.sub(" ", string.strip().lower()).strip()
        tokens = nzd_string.split()
        for key in clusters:
            if setratio(key_tokens[key], tokens) > threshold:
                clusters[key].add(string)
                break
        else:
            # No key exceeded the threshold.  The normalized form may still
            # already be a key (e.g. threshold >= 1.0), so extend that
            # cluster instead of overwriting it and losing its members.
            clusters.setdefault(nzd_string, set()).add(string)
            key_tokens[nzd_string] = tokens
    return clusters
def output_clusters(clusters, fo):
    """Write clusters to *fo* in the tab-indented cluster-file format.

    Clusters are written in sorted key order.  For each cluster one
    unindented line holds the representative string (the smallest member in
    sort order), followed by every member of the cluster, the representative
    included, on its own tab-prefixed line.

    Args:
        clusters: Dict mapping a cluster key to a set of member strings.
        fo: Writable text file object.
    """
    # sorted(dict) yields the keys on both Python 2 and Python 3, whereas
    # dict.iterkeys() only exists on Python 2.
    for key in sorted(clusters):
        # Sorting the members makes the representative and the line order
        # deterministic; iterating a set directly gives an arbitrary order.
        members = sorted(clusters[key])
        if not members:
            # Nothing to write for an empty cluster.
            continue
        fo.write(members[0] + "\n")
        fo.write("\t" + "\n\t".join(members) + "\n")
def read_clusters(fo):
    """Parse a cluster file into a member -> representative mapping.

    A line that does not start with a tab is a header naming the
    representative of the cluster that follows; a line that starts with a
    tab names one member of the current cluster (the text after the tab).

    Args:
        fo: Iterable of text lines, e.g. an open cluster file.

    Returns:
        A dict mapping each member string to its cluster's representative.

    Raises:
        ValueError: If a member line appears before any header line.
    """
    m = {}
    current = None
    for line in fo:
        stripped = line.rstrip("\n")
        # startswith() is safe on an empty line, unlike indexing [0]; an
        # empty line is a valid header whose representative is "".
        if not stripped.startswith("\t"):
            current = stripped
        elif current is None:
            raise ValueError(
                "cluster file has a member line before any header line: %r"
                % stripped)
        else:
            m[stripped[1:]] = current
    return m
def apply_map(m, strings):
    """Lazily translate each item of *strings* through the mapping *m*.

    Yields ``m[item]`` for every item, in input order.  An item that is not
    a key of *m* raises ``KeyError`` when it is reached.
    """
    for raw in strings:
        canonical = m[raw]
        yield canonical
def normalize_command(args):
    """Handle the ``normalize`` sub-command.

    Loads the member -> representative mapping from ``args.cluster_file``,
    then writes the mapped value of every line read from ``args.infile`` to
    ``args.output``, one per line.
    """
    mapping = read_clusters(args.cluster_file)
    # Drop only the trailing newline so any other whitespace still takes
    # part in the lookup.
    raw_lines = (text.rstrip("\n") for text in args.infile)
    sink = args.output
    for canonical in apply_map(mapping, raw_lines):
        sink.write(canonical + "\n")
def cluster_command(args):
    """Handle the ``cluster`` sub-command.

    Clusters the lines read from ``args.infile`` using ``args.threshold``
    and writes the resulting clusters to ``args.outfile``.
    """
    raw_lines = (text.rstrip("\n") for text in args.infile)
    grouped = cluster(raw_lines, args.threshold)
    output_clusters(grouped, args.outfile)
if __name__ == "__main__":
    # Command-line entry point with two sub-commands:
    #   cluster   [--threshold T] [infile] [outfile]
    #   normalize [-o OUTPUT] cluster_file [infile]
    # File arguments left out fall back to stdin / stdout.
    parser = argparse.ArgumentParser()
    # On Python 3 a sub-command is optional by default, so running the script
    # without one would leave ``args.func`` unset and crash below.  Require
    # it so argparse prints a usage error instead; ``dest`` gives argparse a
    # name to use in that error message.
    subparsers = parser.add_subparsers(dest="command")
    subparsers.required = True

    cluster_parser = subparsers.add_parser("cluster")
    cluster_parser.add_argument("--threshold", "-t", type=float, default=0.8)
    cluster_parser.add_argument("infile", nargs="?",
                                type=argparse.FileType('r'), default=sys.stdin)
    cluster_parser.add_argument("outfile", nargs="?", default=sys.stdout,
                                type=argparse.FileType('w'))
    cluster_parser.set_defaults(func=cluster_command)

    normalize_parser = subparsers.add_parser("normalize")
    normalize_parser.add_argument("-o", "--output", default=sys.stdout,
                                  type=argparse.FileType('w'))
    normalize_parser.add_argument("cluster_file", type=argparse.FileType('r'))
    normalize_parser.add_argument("infile", nargs="?", default=sys.stdin,
                                  type=argparse.FileType('r'))
    normalize_parser.set_defaults(func=normalize_command)

    args = parser.parse_args()
    # Dispatch to the handler registered by the chosen sub-command.
    args.func(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment