Last active
November 26, 2017 20:56
-
-
Save joelbecker/fa6f29ad19790cfaa4c9adf14455e2b1 to your computer and use it in GitHub Desktop.
A recordlinkage data fusion example.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime | |
from random import randrange | |
import recordlinkage as rl | |
import recordlinkage.algorithms.conflict_resolution as cr | |
from recordlinkage.datasets import load_febrl4 | |
# Load the two FEBRL4 example data frames shipped with recordlinkage.
dfA, dfB = load_febrl4()


def _random_dates(count):
    """Return ``count`` random ``datetime`` values for the example data.

    ``randrange`` excludes its upper bound, so the bounds below are one past
    the largest wanted value: years 2000-2016, months 1-12 and days 1-28.
    Days stop at 28 so that every generated date is valid in any month.
    """
    return [
        datetime(randrange(2000, 2017), randrange(1, 13), randrange(1, 29))
        for _ in range(count)
    ]


# Adapt dataset for example: add synthetic columns to both data frames so the
# fusion step has timestamps and numeric values to resolve conflicts on.
for frame in (dfA, dfB):
    n_rows = len(frame)
    frame['date_of_birth'] = frame['date_of_birth'].apply(float)
    frame['dates_updated'] = _random_dates(n_rows)
    frame['salary'] = [randrange(40000, 120000) for _ in range(n_rows)]
    frame['min'] = [randrange(10, 20) for _ in range(n_rows)]
    frame['max'] = [randrange(20, 30) for _ in range(n_rows)]

# Sample data subsets (200 random rows from each data frame)
dfA = dfA.sample(200)
dfB = dfB.sample(200)
# Indexation step
# Build candidate record pairs by blocking on the 'given_name' column.
indexer = rl.BlockIndex(on='given_name')
pairs = indexer.index(dfA, dfB)

# Comparison step
# Use the current Compare API: construct an empty Compare object, register
# the comparisons, then call compute() with the pairs and both data frames.
# Passing pairs/df_a/df_b to the constructor and reading `.vectors` is the
# deprecated form.
compare_cl = rl.Compare()
compare_cl.exact('given_name', 'given_name')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85)
compare_cl.exact('date_of_birth', 'date_of_birth')
compare_cl.exact('suburb', 'suburb')
compare_cl.exact('state', 'state')
compare_cl.string('address_1', 'address_1', threshold=0.85)
features = compare_cl.compute(pairs, dfA, dfB)

# Classification step
# Flag a candidate pair as a match when its six comparison scores sum to
# more than 3.
matches = features.sum(axis=1) > 3
# Fusion step: schedule one conflict-resolution job per output column,
# then run them all at the end.
fuse = rl.FuseLinks()

# given_name: prefer the value from data frame a.
fuse.trust_your_friends(
    'given_name',
    'given_name',
    trusted='a',
    name='given_name',
)

# surname: take the value from whichever row was updated most recently.
fuse.keep_up_to_date(
    'surname',
    'surname',
    'dates_updated',
    'dates_updated',
    name='surname',
)

# salary: average the two salary values.
fuse.meet_in_the_middle(
    'salary',
    'salary',
    metric='mean',
    name='salary',
)

# street_number: pick one of the two values at random.
fuse.roll_the_dice(
    'street_number',
    'street_number',
    name='street_number',
)

# soc_sec_id: keep every social security id value for future processing.
fuse.pass_it_on(
    'soc_sec_id',
    'soc_sec_id',
    name='soc_sec_id',
)

# spread: resolve a conflict that spans several columns in each data frame.
fuse.meet_in_the_middle(
    ['min', 'max'],
    ['min', 'max'],
    metric='stdev',
    name='spread',
)

# longest_address: a custom strategy built with the generic resolve method.
fuse.resolve(
    cr.choose_longest,
    ['address_1', 'address_2'],
    ['address_1', 'address_2'],
    tie_break=cr.choose_random,
    name='longest_address',
)

# Run the scheduled conflict resolution jobs for the given candidate links,
# data, and classifications.
fused = fuse.fuse(pairs, dfA, dfB, matches)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Useful!
The Compare API has changed in the previous version. I found out that the deprecation warning was not visible by default. This was changed in the development version (commit: J535D165/recordlinkage@73f5b08). So in the next version, this example will output a DeprecationWarning.
The new version for the example above: