Skip to content

Instantly share code, notes, and snippets.

@pilgrim2go
Forked from sethc23/turnstile_funct.py
Created October 27, 2016 11:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pilgrim2go/7fb9f96dec1315d6ab45aa30d2aa38be to your computer and use it in GitHub Desktop.
Save pilgrim2go/7fb9f96dec1315d6ab45aa30d2aa38be to your computer and use it in GitHub Desktop.
f(x) z_get_string_dist
def z_get_string_dist(self):
cmd="""
DROP TYPE IF EXISTS string_dist_results cascade;
CREATE TYPE string_dist_results as (
idx integer,
orig_str text,
jaro double precision,
jaro_b text,
leven integer,
leven_b text,
nysiis text,
rating_codex text
);
DROP FUNCTION IF EXISTS z_get_string_dist(integer[],text,text,text[],
boolean,boolean,boolean,boolean,boolean);
CREATE OR REPLACE FUNCTION z_get_string_dist( idx integer[],
string_set text[],
compare_tbl text,
compare_col text[],
jaro boolean default true,
leven boolean default true,
nysiis boolean default true,
rating_codex boolean default true,
usps_repl_first boolean default true)
RETURNS SETOF string_dist_results AS $$
from jellyfish import cjellyfish as J
from traceback import format_exc as tb_format_exc
from sys import exc_info as sys_exc_info
class string_dist_results:
def __init__(self,upd=None):
if upd:
self.__dict__.update( upd)
important_cols = [ 'street_name','from_street_name','variation','primary_name','common_use',
'usps_abbr','pattern']
T = { 'tbl' : compare_tbl,
'concat_col' : ''.join(["concat_ws(' ',",
",".join(compare_col),
")"]),
'not_null_cols' : 'WHERE ' + ' is not null and '.join([it for it in compare_col
if important_cols.count(it)>0]) + ' is not null',
}
if T['not_null_cols']=='WHERE is not null':
T['not_null_cols'] = ''
#plpy.log(T)
try:
p = "select distinct ##(concat_col)s comparison from ##(tbl)s ##(not_null_cols)s;" ## T
res = plpy.execute(p)
if len(res)==0:
plpy.log( "string_dist_results: NO DATA AVAILABLE FROM ##(tbl)s IN ##(tbl)s" ## T)
return
else:
# plpy.log(res)
res = map(lambda s: unicode(s['comparison']),res)
#plpy.log("about to start")
for i in range(len(idx)):
#plpy.log("started")
_word = unicode(string_set[i].upper())
if not _word:
plpy.log( string_set)
plpy.log( "not word")
plpy.log( _word)
yield( None)
else:
t = { 'idx' : idx[i],
'orig_str' : _word }
if jaro:
# plpy.log(t)
t.update( dict(zip(['jaro','jaro_b'],
sorted(map(lambda s: (J.jaro_distance(_word,s),s),res ) )[-1:][0])))
if leven:
t.update( dict(zip(['leven','leven_b'],
sorted(map(lambda s: (J.levenshtein_distance(_word,s),s),res ) )[0:][0])))
if nysiis:
t.update( { 'nysiis' : J.nysiis(_word) })
if rating_codex:
t.update( { 'rating_codex' : J.match_rating_codex(_word) })
# plpy.log(t)
r = string_dist_results(t)
yield( r)
return
except Exception as e:
plpy.log( tb_format_exc())
plpy.log( sys_exc_info()[0])
plpy.log( e)
return
$$ LANGUAGE plpythonu;
""".replace('##','%')
self.T.conn.set_isolation_level( 0)
self.T.cur.execute( cmd)
return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment