Created
July 19, 2017 20:55
Star
You must be signed in to star a gist
A prototype data fusion workflow for the Python recordlinkage toolkit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import recordlinkage as rl | |
####################
# Pseudo Source Code
####################
class FuseCore(object):
    """Skeleton of the core data fusion workflow (pseudo code).

    Every method body is a ``pass`` placeholder; the comments inside each
    method describe the intended behaviour, which is not implemented yet.
    """

    def __init__(self, vectors, df_a, df_b, suffix_a='_a', suffix_b='_b'):
        """Accept the inputs of a fusion job.

        Placeholder: nothing is stored or computed here yet.

        Args:
            vectors: presumably the comparison vectors of the accepted
                record pairs -- TODO confirm once implemented.
            df_a: first data frame to fuse.
            df_b: second data frame to fuse.
            suffix_a: suffix associated with ``df_a`` (default ``'_a'``).
            suffix_b: suffix associated with ``df_b`` (default ``'_b'``).
        """
        pass

    def _do_fuse(self, vectors, df_a, df_b, suffix_a, suffix_b):
        """Fuse the two data frames (placeholder, returns ``None``)."""
        # Fuse data frames
        # Initially, keep all data for future resolution
        pass

    def resolve(self, fun, s1, s2, *args, **kwargs):
        """Integrate two columns into one (placeholder, returns ``None``).

        Args:
            fun: conflict resolution strategy to apply.
            s1: first column.
            s2: second column.
            *args: extra positional arguments; presumably forwarded to
                ``fun`` -- TODO confirm once implemented.
            **kwargs: extra keyword arguments; presumably forwarded to
                ``fun`` -- TODO confirm once implemented.
        """
        # Integrate two columns into one using the
        # given conflict resolution strategy
        pass

    def fuse(self):
        """Return the fused data frame (placeholder, returns ``None``)."""
        # Return the fused data frame
        pass
class Fuse(FuseCore):
    """Named conflict resolution strategies built on ``FuseCore.resolve``.

    Each public method defines a nested resolver (still a ``pass``
    placeholder) and hands it to ``self.resolve`` together with two columns
    looked up in ``self.fused``.

    NOTE(review): ``self.fused`` is read here but is not assigned anywhere
    in the visible code; presumably the base class is meant to populate it.
    """

    def trust_your_friends(self, c1, c2, trusted):
        """Resolve columns ``c1`` and ``c2`` with "trust your friends".

        Args:
            c1: key of the first column in ``self.fused``.
            c2: key of the second column in ``self.fused``.
            trusted: passed to ``resolve`` as the ``trust`` keyword.

        Returns:
            Whatever ``self.resolve`` returns.
        """
        def _trust_your_friends(val_1, val_2, trust='df_a'):
            # Implement trust your friends resolution method
            pass

        return self.resolve(_trust_your_friends, self.fused[c1], self.fused[c2], trust=trusted)

    def no_gossiping(self, c1, c2):
        """Resolve columns ``c1`` and ``c2`` with "no gossiping".

        Args:
            c1: key of the first column in ``self.fused``.
            c2: key of the second column in ``self.fused``.

        Returns:
            Whatever ``self.resolve`` returns.
        """
        # No ``trust`` parameter: the resolve() call below does not supply
        # one, so a required third argument could never be satisfied.
        def _no_gossiping(val_1, val_2):
            # Implement no gossiping resolution method
            pass

        return self.resolve(_no_gossiping, self.fused[c1], self.fused[c2])

    def take_the_information(self, c1, c2):
        """Resolve columns ``c1`` and ``c2`` with "take the information".

        Args:
            c1: key of the first column in ``self.fused``.
            c2: key of the second column in ``self.fused``.

        Returns:
            Whatever ``self.resolve`` returns.
        """
        def _take_the_information(val_1, val_2):
            # Implement take the information conflict resolution method
            pass

        return self.resolve(_take_the_information, self.fused[c1], self.fused[c2])
################################
# Pseudo Data Integration Script
################################
# Original Data
# (annotation-only declarations: no values are bound in this pseudo script)
my_df_a: pd.DataFrame
my_df_b: pd.DataFrame

# Do core data integration analysis
my_comp: rl.Compare
my_classi: rl.Classifier

# Refine your pairs
# recordlinkage classifiers return the predicted matches as a
# pandas.MultiIndex of record-pair labels, so the rows must be selected by
# label (.loc); positional .iloc does not accept index labels.
my_vectors = my_comp.vectors.loc[my_classi.predict(my_comp.vectors)]

# Perform Data fusion
my_fuse = Fuse(my_vectors, my_comp.df_a, my_comp.df_b)
my_fuse.no_gossiping('col1', 'col2')
my_fuse.no_gossiping('col3', 'col4')
# NOTE(review): the nested resolver's default is trust='df_a'; confirm that
# 'a' is an accepted value once the strategy is implemented.
my_fuse.trust_your_friends('col5', 'col6', trusted='a')
my_fuse.take_the_information('col7', 'col8')

# Get output
my_integrated_data = my_fuse.fuse()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment