-
-
Save pilgrim2go/7fb9f96dec1315d6ab45aa30d2aa38be to your computer and use it in GitHub Desktop.
f(x) z_get_string_dist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def z_get_string_dist(self): | |
cmd=""" | |
DROP TYPE IF EXISTS string_dist_results cascade; | |
CREATE TYPE string_dist_results as ( | |
idx integer, | |
orig_str text, | |
jaro double precision, | |
jaro_b text, | |
leven integer, | |
leven_b text, | |
nysiis text, | |
rating_codex text | |
); | |
DROP FUNCTION IF EXISTS z_get_string_dist(integer[],text,text,text[], | |
boolean,boolean,boolean,boolean,boolean); | |
CREATE OR REPLACE FUNCTION z_get_string_dist( idx integer[], | |
string_set text[], | |
compare_tbl text, | |
compare_col text[], | |
jaro boolean default true, | |
leven boolean default true, | |
nysiis boolean default true, | |
rating_codex boolean default true, | |
usps_repl_first boolean default true) | |
RETURNS SETOF string_dist_results AS $$ | |
from jellyfish import cjellyfish as J | |
from traceback import format_exc as tb_format_exc | |
from sys import exc_info as sys_exc_info | |
class string_dist_results: | |
def __init__(self,upd=None): | |
if upd: | |
self.__dict__.update( upd) | |
important_cols = [ 'street_name','from_street_name','variation','primary_name','common_use', | |
'usps_abbr','pattern'] | |
T = { 'tbl' : compare_tbl, | |
'concat_col' : ''.join(["concat_ws(' ',", | |
",".join(compare_col), | |
")"]), | |
'not_null_cols' : 'WHERE ' + ' is not null and '.join([it for it in compare_col | |
if important_cols.count(it)>0]) + ' is not null', | |
} | |
if T['not_null_cols']=='WHERE is not null': | |
T['not_null_cols'] = '' | |
#plpy.log(T) | |
try: | |
p = "select distinct ##(concat_col)s comparison from ##(tbl)s ##(not_null_cols)s;" ## T | |
res = plpy.execute(p) | |
if len(res)==0: | |
plpy.log( "string_dist_results: NO DATA AVAILABLE FROM ##(tbl)s IN ##(tbl)s" ## T) | |
return | |
else: | |
# plpy.log(res) | |
res = map(lambda s: unicode(s['comparison']),res) | |
#plpy.log("about to start") | |
for i in range(len(idx)): | |
#plpy.log("started") | |
_word = unicode(string_set[i].upper()) | |
if not _word: | |
plpy.log( string_set) | |
plpy.log( "not word") | |
plpy.log( _word) | |
yield( None) | |
else: | |
t = { 'idx' : idx[i], | |
'orig_str' : _word } | |
if jaro: | |
# plpy.log(t) | |
t.update( dict(zip(['jaro','jaro_b'], | |
sorted(map(lambda s: (J.jaro_distance(_word,s),s),res ) )[-1:][0]))) | |
if leven: | |
t.update( dict(zip(['leven','leven_b'], | |
sorted(map(lambda s: (J.levenshtein_distance(_word,s),s),res ) )[0:][0]))) | |
if nysiis: | |
t.update( { 'nysiis' : J.nysiis(_word) }) | |
if rating_codex: | |
t.update( { 'rating_codex' : J.match_rating_codex(_word) }) | |
# plpy.log(t) | |
r = string_dist_results(t) | |
yield( r) | |
return | |
except Exception as e: | |
plpy.log( tb_format_exc()) | |
plpy.log( sys_exc_info()[0]) | |
plpy.log( e) | |
return | |
$$ LANGUAGE plpythonu; | |
""".replace('##','%') | |
self.T.conn.set_isolation_level( 0) | |
self.T.cur.execute( cmd) | |
return |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment