Skip to content

Instantly share code, notes, and snippets.

@RobinL
Created March 6, 2024 17:09
Show Gist options
  • Save RobinL/6e11c04aa1204ac3e7452eddd778ab4f to your computer and use it in GitHub Desktop.
Save RobinL/6e11c04aa1204ac3e7452eddd778ab4f to your computer and use it in GitHub Desktop.
disable exact match detection
import duckdb
import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_library as cl
from splink.duckdb.linker import DuckDBLinker
# duckdb sql statement that creates a few rows pertaining to a few people
# using UNION ALL
sql = """
SELECT '1' as unique_id, 'John' as first_name, ['a','b','c'] as test_array
UNION ALL
SELECT '2', 'Jane', ['a','e','f']
UNION ALL
SELECT '3', 'Jack', ['a','h','i']
UNION ALL
SELECT '4', 'Jill', ['a','k','l']
UNION ALL
SELECT '5', 'Joe', ['a','n','o']
"""
df_master = duckdb.sql(sql)
print(df_master)
sql = """
SELECT '5' as unique_id, 'John' as first_name, ['a'] as test_array
UNION ALL
SELECT '6', 'John', ['e']
UNION ALL
SELECT '7', 'Jane', ['e']
"""
df_long = duckdb.sql(sql)
print(df_long)
# Derive custom tf table
sql = """
with exploded as (
select unnest(test_array) as test_array
from df_master
)
select [test_array] as test_array, count(*)/(select count(*) from exploded) as tf_test_array
from exploded
group by test_array
"""
__splink__df_tf_test_array = duckdb.sql(sql).df()
__splink__df_tf_test_array
comparison_test_array = {
"output_column_name": "test_array",
"comparison_levels": [
cll.null_level("test_array"),
{
"sql_condition": 'array_length(list_intersect("test_array_l", "test_array_r"))>= 1',
"label_for_charts": "Arrays intersect",
"tf_adjustment_column": "test_array", # This controls which lookup to use, it's unrelated to the exact match level
"tf_adjustment_weight": 1.0,
"disable_tf_exact_match_detection": True, # Don't look for an exact match level, use own level instead
},
cll.else_level(),
],
"comparison_description": "arr",
}
settings = {
"probability_two_random_records_match": 0.01,
"link_type": "link_only",
"blocking_rules_to_generate_predictions": ["1=1"], # full cartesian product
"comparisons": [cl.exact_match("first_name"), comparison_test_array],
"retain_intermediate_calculation_columns": True,
}
linker = DuckDBLinker(["df_long", "df_master"], settings)
linker.register_term_frequency_lookup(__splink__df_tf_test_array, "test_array")
import logging
logging.getLogger("splink").setLevel(1)
linker.predict().as_pandas_dataframe()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment