Created
November 22, 2020 19:49
-
-
Save ronniejoshua/e267b41a54d10017a4a7ac5d4ee8c496 to your computer and use it in GitHub Desktop.
Implementation of Join - Using Key Column & Data Generation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from data_extractor.nobel_api import nobel_api_laureates | |
from data_extractor.jinfo_data import jewish_nobel_winners | |
from data_extractor.utils import nobel_laureates_dataframe | |
if __name__ == "__main__": | |
# Extract data | |
df_jew = pd.DataFrame(jewish_nobel_winners()) | |
df_nobel = pd.DataFrame(nobel_api_laureates()) | |
# Mapping Categories between two dataframes | |
dict_key = { | |
"Economic Sciences": "Economics", | |
"Physics": "Physics", | |
"Chemistry": "Chemistry", | |
"Peace": "Peace", | |
"Physiology or Medicine": "Medicine", | |
"Literature": "Literature", | |
} | |
# Create a new col - Mapping the "Nobel Categories" | |
df_nobel.loc[:, "new_category"] = df_nobel["category"].map(dict_key) | |
# Creating key_col to join the two dataframes to findout jewish laureates | |
# fuzzy match is based on the name, hence we normalize the name | |
def convert_name(x): | |
ans = str(x).lower().replace(" ", "") | |
return ans.replace(".", "") | |
# Creating the Key Column {Fuzzy Match Based on Name, year & Category} | |
# Note the use of () to chain the operations | |
df_nobel.loc[:, "key_col"] = ( | |
df_nobel["knownName"].apply(lambda x: convert_name(x)) | |
+ df_nobel["awardYear"].map(str) | |
+ df_nobel["new_category"].map(str).map(str.lower) | |
) | |
df_jew.loc[:, "key_col"] = ( | |
df_jew["jinfo_laureate"].map(str).map(str.lower).apply(lambda x: x.replace(" ", "")) | |
+ df_jew["jinfo_award_year"].map(str) | |
+ df_jew["jinfo_category"].map(str).map(str.lower) | |
) | |
# Create a df_matched - which combines the two dfs and does a fuzzy join | |
df_matched = nobel_laureates_dataframe(df_nobel, df_jew) | |
df_matched.loc[:, "check"] = df_matched["matches"].apply(lambda x: len(x)) | |
df_jew_matched = df_matched[df_matched["check"] > 0] | |
print(df_jew_matched.shape) | |
df_matched.to_csv("./data.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment