Skip to content

Instantly share code, notes, and snippets.

@RobinL
Created January 25, 2024 14:22
Show Gist options
  • Save RobinL/d8a84f7a31fa7cb17dafb05c94518225 to your computer and use it in GitHub Desktop.
Save RobinL/d8a84f7a31fa7cb17dafb05c94518225 to your computer and use it in GitHub Desktop.
string distance within arrays
import duckdb
import pandas as pd
data = {
'arr_1': [["robin","dave","james"]],
'arr_2': [["robyn","steve"]]
}
df = pd.DataFrame(data)
query = """
SELECT
flatten(list_transform(arr_1,
x -> (list_transform(arr_2, y -> [x,y]))
)) as all_pairs
FROM df
"""
df_c = duckdb.sql(query).df()
print(df_c.iloc[0,0])
query = """
SELECT
list_transform(all_pairs,
x -> levenshtein(x[1], x[2]) < 2
) as scores
FROM df_c
"""
df_scored = duckdb.sql(query).df()
print(duckdb.sql(query).df().iloc[0,0])
query = """
SELECT
list_reduce(scores, (x,y) -> x or y)
FROM df_scored
"""
print(duckdb.sql(query).df().iloc[0,0])
[['robin', 'robyn'], ['robin', 'steve'], ['dave', 'robyn'], ['dave', 'steve'], ['james', 'robyn'], ['james', 'steve']]
[True, False, False, False, False, False]
True
@JonnyNZCustoms
Copy link

Hi Robin,

I can follow the code above, but am unsure how to use it to create a custom SQL comparison. I have had a good look through the user guides and other training materials here and didn't find anything. Are you able to post a link to an example of creating a complicated custom SQL comparison like this? Or give some hints about how to deploy something like the code above into my Splink workflow?

Thanks so much, I am really enjoying working with Splink :)

@JonnyNZCustoms
Copy link

Thanks again Robin,

For anyone finding this thread later on, here is my full comparison code which works for me:

comparison_fuzzy_array_phone = {
    "output_column_name": "phone_numbers",
    "comparison_description": "Fuzzy array phone numbers",
    "comparison_levels": [
        {
            "sql_condition": "phone_numbers_l IS NULL OR phone_numbers_r IS NULL OR phone_numbers_l == [] OR phone_numbers_r == []",
            "label_for_charts": "Null",
            "is_null_level": True,
        },
        {
            "sql_condition": " list_reduce( list_transform(    flatten(list_transform(phone_numbers_l, x -> (list_transform(phone_numbers_r, y -> [x,y])))) , x -> levenshtein(x[1], x[2]) < 2  )    , (x,y) -> x or y ) ",        
            "label_for_charts": "Levenshtein <= 2",
        },
        {"sql_condition": "ELSE", "label_for_charts": "All other comparisons"},
    ],
}

@RobinL
Copy link
Author

RobinL commented Apr 19, 2024

Super - thanks. Just dropping in a full runnable example here for future ref:

import pandas as pd

import splink.duckdb.comparison_level_library as cll
from splink.duckdb.blocking_rule_library import block_on
from splink.duckdb.comparison_library import exact_match
from splink.duckdb.linker import DuckDBLinker

data = [
    {
        "unique_id": 1,
        "first_name": "John",
        "phone_numbers": ["123456", "654321"],
    },
    {
        "unique_id": 2,
        "first_name": "John",
        "phone_numbers": ["123455"],
    },
    {
        "unique_id": 3,
        "first_name": "John",
        "phone_numbers": ["123456"],
    },
    {
        "unique_id": 4,
        "first_name": "John",
        "phone_numbers": ["9999", "8888"],
    },
]

df = pd.DataFrame(data)


comparison_fuzzy_array_phone = {
    "output_column_name": "phone_numbers",
    "comparison_description": "Fuzzy array phone numbers",
    "comparison_levels": [
        {
            "sql_condition": "phone_numbers_l IS NULL OR phone_numbers_r IS NULL OR phone_numbers_l == [] OR phone_numbers_r == []",
            "label_for_charts": "Null",
            "is_null_level": True,
        },
        cll.array_intersect_level("phone_numbers", 1),
        {
            "sql_condition": " list_reduce( list_transform(    flatten(list_transform(phone_numbers_l, x -> (list_transform(phone_numbers_r, y -> [x,y])))) , x -> levenshtein(x[1], x[2]) < 2  )    , (x,y) -> x or y ) ",
            "label_for_charts": "Levenshtein <= 2",
        },
        {"sql_condition": "ELSE", "label_for_charts": "All other comparisons"},
    ],
}

settings = {
    "probability_two_random_records_match": 0.01,
    "link_type": "dedupe_only",
    "comparisons": [
        comparison_fuzzy_array_phone,
        exact_match("first_name"),
        
    ],
    "retain_intermediate_calculation_columns": True,
}


linker = DuckDBLinker(df, settings)

linker.predict().as_pandas_dataframe()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment