Skip to content

Instantly share code, notes, and snippets.

@ronniejoshua
Created November 22, 2020 19:49
Show Gist options
  • Save ronniejoshua/e267b41a54d10017a4a7ac5d4ee8c496 to your computer and use it in GitHub Desktop.
Save ronniejoshua/e267b41a54d10017a4a7ac5d4ee8c496 to your computer and use it in GitHub Desktop.
Implementation of Join - Using Key Column & Data Generation
import pandas as pd
from data_extractor.nobel_api import nobel_api_laureates
from data_extractor.jinfo_data import jewish_nobel_winners
from data_extractor.utils import nobel_laureates_dataframe
# Nobel API category name -> category name used by the JInfo data set.
CATEGORY_MAP = {
    "Economic Sciences": "Economics",
    "Physics": "Physics",
    "Chemistry": "Chemistry",
    "Peace": "Peace",
    "Physiology or Medicine": "Medicine",
    "Literature": "Literature",
}


def convert_name(x):
    """Return ``x`` as a lowercase string with spaces and periods removed.

    Used to normalize laureate names before they are concatenated into the
    join key, e.g. "Albert A. Michelson" -> "albertamichelson".
    """
    return str(x).lower().replace(" ", "").replace(".", "")


def make_key_col(names, years, categories):
    """Build the join key: normalized name + year + lowercased category.

    Both data frames must build their key through this one function so the
    name normalization is identical on each side of the join.

    Args:
        names: pandas Series of laureate names.
        years: pandas Series of award years (stringified as-is).
        categories: pandas Series of category names (stringified, lowercased).

    Returns:
        A pandas Series of key strings.
    """
    return (
        names.map(convert_name)
        + years.map(str)
        + categories.map(str).map(str.lower)
    )


def main():
    """Extract both data sets, key them, match them and write ``./data.csv``."""
    # Extract data
    df_jew = pd.DataFrame(jewish_nobel_winners())
    df_nobel = pd.DataFrame(nobel_api_laureates())

    # Translate the Nobel API categories to the JInfo naming. A category that
    # is missing from CATEGORY_MAP becomes NaN and ends up as "nan" in the key.
    df_nobel.loc[:, "new_category"] = df_nobel["category"].map(CATEGORY_MAP)

    # Key column used for the match: name + year + category.
    df_nobel.loc[:, "key_col"] = make_key_col(
        df_nobel["knownName"],
        df_nobel["awardYear"],
        df_nobel["new_category"],
    )
    # Previously this side only lowercased and removed spaces, leaving periods
    # in the name while the Nobel side stripped them; now both sides agree.
    df_jew.loc[:, "key_col"] = make_key_col(
        df_jew["jinfo_laureate"],
        df_jew["jinfo_award_year"],
        df_jew["jinfo_category"],
    )

    # Combine the two frames. NOTE(review): nobel_laureates_dataframe is
    # defined elsewhere; presumably it performs the fuzzy join on "key_col"
    # and returns a "matches" column of sized collections - confirm.
    df_matched = nobel_laureates_dataframe(df_nobel, df_jew)
    df_matched.loc[:, "check"] = df_matched["matches"].apply(len)

    # Rows with at least one match; only their count is reported.
    df_jew_matched = df_matched[df_matched["check"] > 0]
    print(df_jew_matched.shape)

    # The full frame (matched and unmatched rows) is what gets written out.
    df_matched.to_csv("./data.csv")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment