Created
November 13, 2018 17:49
-
-
Save korkridake/ba9471801635e395b3bf5d32239820b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# My Pythonic Implementation of Base-to-Master Searching | |
# Author: @Korkrid Akepanidtaworn | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# Standard python library | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
import os | |
import pandas as pd | |
import numpy as np | |
import warnings | |
warnings.filterwarnings('ignore') | |
from datetime import timedelta | |
import monthdelta | |
from glob import glob | |
from functools import reduce | |
from IPython.core.interactiveshell import InteractiveShell | |
InteractiveShell.ast_node_interactivity = "all" | |
%config Completer.use_jedi = False | |
pd.options.display.max_rows = 999 | |
pd.options.display.max_columns = 999 | |
from IPython.core.display import display, HTML | |
display(HTML("<style>.container { width:90% !important; }</style>")) | |
%matplotlib inline | |
from IPython.display import display, HTML | |
pd.options.display.float_format = '{:20,.2f}'.format | |
# !pip install monthdelta | |
# !pip install tqdm | |
# !python -m pip install --upgrade pip | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# Dev Test | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
for i in tqdm(range(10000)): | |
pass | |
from random import randint | |
print(randint(0, 9)) | |
import random | |
random.sample(range(100), 10) | |
random.choices(range(4), k=20) | |
def random_datetimes_or_dates(start, end, out_format='datetime', n=20): | |
''' | |
unix timestamp is in ns by default. | |
I divide the unix time value by 10**9 to make it seconds (or 24*60*60*10**9 to make it days). | |
The corresponding unit variable is passed to the pd.to_datetime function. | |
Values for the (divide_by, unit) pair to select is defined by the out_format parameter. | |
for 1 -> out_format='datetime' | |
for 2 -> out_format=anything else | |
''' | |
(divide_by, unit) = (10**9, 's') if out_format=='datetime' else (24*60*60*10**9, 'D') | |
start_u = start.value//divide_by | |
end_u = end.value//divide_by | |
return pd.to_datetime(np.random.randint(start_u, end_u, n), unit=unit) | |
start = pd.to_datetime('2015-01-01') | |
end = pd.to_datetime('2018-01-01') | |
random_datetimes_or_dates(start, end, out_format='not datetime') | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# Create a Sample Dataframe | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
mydf_1 = pd.DataFrame({'ID':random.choices(range(4), k=20), | |
'yyyymm_1':random_datetimes_or_dates(start, end, out_format='not datetime'), | |
'other_info':random.sample(range(100), 20)}) | |
mydf_1 = mydf_1.sort_values(by = ['ID', 'yyyymm_1']) | |
mydf_1 | |
mydf_2 = pd.DataFrame({'ID':random.choices(range(6), k=5), | |
'yyyymm_2':random_datetimes_or_dates(start, end, out_format='not datetime', n = 5)}) | |
mydf_2 = mydf_2.sort_values(by = ['ID', 'yyyymm_2']) | |
mydf_2 | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# Backlog Dev Test | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
for index, row in mydf_2.iterrows(): | |
print(row['ID'], row['yyyymm_2']) | |
# ------------------------------------------------ | |
for index, row in mydf_2.iterrows(): | |
individual_ID = row['ID'] | |
individual_yyyymm_2 = row['yyyymm_2'] | |
temp_data = mydf_1[mydf_1['ID'] == individual_ID] | |
temp_data['yyyymm_2'] = individual_yyyymm_2 | |
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M') | |
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0) | |
temp_data | |
# ------------------------------------------------ | |
for index, row in mydf_2.iterrows(): | |
individual_ID = row['ID'] | |
individual_yyyymm_2 = row['yyyymm_2'] | |
temp_data = mydf_1[mydf_1['ID'] == individual_ID] | |
len(temp_data) | |
# ------------------------------------------------ | |
df_final = pd.DataFrame() | |
for index, row in mydf_2.iterrows(): | |
individual_ID = row['ID'] | |
individual_yyyymm_2 = row['yyyymm_2'] | |
temp_data = mydf_1[mydf_1['ID'] == individual_ID] | |
if len(temp_data) == 0: | |
df_final = df_final.append(row) | |
else: | |
temp_data['yyyymm_2'] = individual_yyyymm_2 | |
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M') | |
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0) | |
if 1 in np.array(temp_data['temp_flag']): | |
continue | |
else: | |
df_final = df_final.append(row) | |
df_final | |
# ------------------------------------------------ | |
df_final = pd.DataFrame() | |
for index, row in mydf_2.iterrows(): | |
df_final.append(row) | |
df_final | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
# Contributed Function | |
# ------------------------------------------------ | |
# ------------------------------------------------ | |
def mydf_2_search_in_mydf_1_pandas_way(data_to_loop): | |
''' | |
Signature: dataframe -> dataframe | |
Author: @Korkrid Akepanidtaworn | |
Description: Loop through the base table (table 1) against the master table (table 2) | |
# array([[1.0, Timestamp('2015-12-04 00:00:00')], | |
# [2.0, Timestamp('2016-03-13 00:00:00')], | |
# [2.0, Timestamp('2017-08-27 00:00:00')], | |
# [4.0, Timestamp('2015-12-23 00:00:00')], | |
# [5.0, Timestamp('2017-08-23 00:00:00')]], dtype=object) | |
''' | |
df_final = pd.DataFrame() | |
for index, row in data_to_loop.iterrows(): | |
individual_ID = row['ID'] | |
individual_yyyymm_2 = row['yyyymm_2'] | |
temp_data = mydf_1[mydf_1['ID'] == individual_ID] | |
if len(temp_data) == 0: | |
df_final = df_final.append(row) | |
else: | |
temp_data['yyyymm_2'] = individual_yyyymm_2 | |
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M') | |
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0) | |
if 1 in np.array(temp_data['temp_flag']): | |
continue | |
else: | |
df_final = df_final.append(row) | |
return df_final | |
mydf_3 = pd.DataFrame({'ID':random.choices(range(6), k=5), | |
'yyyymm_2':random_datetimes_or_dates(start, end, out_format='not datetime', n = 5)}) | |
mydf_3 = mydf_3.sort_values(by = ['ID', 'yyyymm_2']) | |
mydf_3 | |
mydf_2_search_in_mydf_1_pandas_way(mydf_3) | |
# array([[1.0, Timestamp('2015-12-04 00:00:00')], | |
# [2.0, Timestamp('2016-03-13 00:00:00')], | |
# [2.0, Timestamp('2017-08-27 00:00:00')], | |
# [4.0, Timestamp('2015-12-23 00:00:00')], | |
# [5.0, Timestamp('2017-08-23 00:00:00')]], dtype=object) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment