Skip to content

Instantly share code, notes, and snippets.

@joelbecker
Created July 19, 2017 20:55
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save joelbecker/53b6a2afd75567091a1d9bd0e45e2b31 to your computer and use it in GitHub Desktop.
A prototype data fusion workflow for the Python recordlinkage toolkit.
import pandas as pd
import recordlinkage as rl
####################
# Pseudo Source Code
####################
class FuseCore(object):
    """Skeleton base class for the prototype data fusion workflow.

    Every method body is still a placeholder (``pass``); this class only
    fixes the intended method names and signatures.

    NOTE(review): subclasses read ``self.fused``, but nothing in this class
    assigns it yet -- presumably ``_do_fuse`` is meant to; confirm when
    implementing.
    """

    def __init__(self, vectors, df_a, df_b, suffix_a='_a', suffix_b='_b'):
        """Placeholder constructor; currently stores nothing.

        :param vectors: comparison vectors of the record pairs to fuse
            (presumably a pandas object indexed by record pair -- TODO confirm).
        :param df_a: first source data frame.
        :param df_b: second source data frame.
        :param suffix_a: suffix associated with ``df_a`` (default ``'_a'``).
        :param suffix_b: suffix associated with ``df_b`` (default ``'_b'``).
        """
        pass

    def _do_fuse(self, vectors, df_a, df_b, suffix_a, suffix_b):
        """Placeholder for the step that fuses the two data frames.

        Takes the same arguments as ``__init__``, without defaults.
        """
        # Fuse data frames
        # Initially, keep all data for future resolution
        pass

    def resolve(self, fun, s1, s2, *args, **kwargs):
        """Placeholder for integrating two columns into one.

        :param fun: conflict resolution strategy to apply.
        :param s1: first column.
        :param s2: second column.
        :param args: extra positional arguments (presumably forwarded to
            ``fun`` -- TODO confirm once implemented).
        :param kwargs: extra keyword arguments (presumably forwarded to
            ``fun`` -- TODO confirm once implemented).
        """
        # Integrate two columns into one using the
        # given conflict resolution strategy
        pass

    def fuse(self):
        """Placeholder for returning the fused data frame.

        Declared as a regular instance method so that ``instance.fuse()``
        is callable; the original ``def fuse()`` lacked both ``self`` and
        the trailing colon.
        """
        # Return the fused data frame
        pass
class Fuse(FuseCore):
    """Named conflict resolution strategies built on ``FuseCore``.

    Each public method defines a pairwise helper and hands it, together
    with two columns looked up in ``self.fused``, to ``self.resolve``.
    The helpers themselves are unimplemented placeholders.
    """

    def trust_your_friends(self, c1, c2, trusted):
        """Resolve ``c1``/``c2`` with the "trust your friends" strategy.

        :param c1: key of the first column in ``self.fused``.
        :param c2: key of the second column in ``self.fused``.
        :param trusted: passed to ``self.resolve`` as the ``trust`` keyword.
        :return: whatever ``self.resolve`` returns.
        """
        def _trust_your_friends(val_1, val_2, trust='df_a'):
            # Implement trust your friends resolution method
            pass

        return self.resolve(_trust_your_friends, self.fused[c1], self.fused[c2], trust=trusted)

    def no_gossiping(self, c1, c2):
        """Resolve ``c1``/``c2`` with the "no gossiping" strategy.

        :param c1: key of the first column in ``self.fused``.
        :param c2: key of the second column in ``self.fused``.
        :return: whatever ``self.resolve`` returns.
        """
        # The helper takes only the two values: ``resolve`` is given no
        # extra arguments here, so a required third parameter could never
        # be supplied (assuming ``resolve`` forwards its extras to ``fun``).
        def _no_gossiping(val_1, val_2):
            # Implement no gossiping resolution method
            pass

        return self.resolve(_no_gossiping, self.fused[c1], self.fused[c2])

    def take_the_information(self, c1, c2):
        """Resolve ``c1``/``c2`` with the "take the information" strategy.

        :param c1: key of the first column in ``self.fused``.
        :param c2: key of the second column in ``self.fused``.
        :return: whatever ``self.resolve`` returns.
        """
        def _take_the_information(val_1, val_2):
            # Implement take the information conflict resolution method
            pass

        return self.resolve(_take_the_information, self.fused[c1], self.fused[c2])
################################
# Pseudo Data Integration Script
################################
# Inputs: the two source tables (declared only; no values are assigned here)
my_df_a: pd.DataFrame
my_df_b: pd.DataFrame

# Results of the core record linkage analysis (declared only)
my_comp: rl.Compare
my_classi: rl.Classifier

# Keep only the comparison vectors selected by the classifier's prediction
_all_vectors = my_comp.vectors
my_vectors = _all_vectors.iloc[my_classi.predict(_all_vectors)]

# Build the fusion object from the refined pairs and both data frames
my_fuse = Fuse(my_vectors, my_comp.df_a, my_comp.df_b)

# Queue one conflict resolution per column pair
for _left, _right in (('col1', 'col2'), ('col3', 'col4')):
    my_fuse.no_gossiping(_left, _right)
my_fuse.trust_your_friends('col5', 'col6', trusted='a')
my_fuse.take_the_information('col7', 'col8')

# Retrieve the integrated result
my_integrated_data = my_fuse.fuse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment