Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save korkridake/ba9471801635e395b3bf5d32239820b0 to your computer and use it in GitHub Desktop.
Save korkridake/ba9471801635e395b3bf5d32239820b0 to your computer and use it in GitHub Desktop.
# ------------------------------------------------
# ------------------------------------------------
# My Pythonic Implementation of Base-to-Master Searching
# Author: @Korkrid Akepanidtaworn
# ------------------------------------------------
# ------------------------------------------------
# ------------------------------------------------
# ------------------------------------------------
# Standard python library
# ------------------------------------------------
# ------------------------------------------------
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import timedelta
import monthdelta
from glob import glob
from functools import reduce
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config Completer.use_jedi = False
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%matplotlib inline
from IPython.display import display, HTML
pd.options.display.float_format = '{:20,.2f}'.format
# !pip install monthdelta
# !pip install tqdm
# !python -m pip install --upgrade pip
# ------------------------------------------------
# ------------------------------------------------
# Dev Test
# ------------------------------------------------
# ------------------------------------------------
for i in tqdm(range(10000)):
pass
from random import randint
print(randint(0, 9))
import random
random.sample(range(100), 10)
random.choices(range(4), k=20)
def random_datetimes_or_dates(start, end, out_format='datetime', n=20):
'''
unix timestamp is in ns by default.
I divide the unix time value by 10**9 to make it seconds (or 24*60*60*10**9 to make it days).
The corresponding unit variable is passed to the pd.to_datetime function.
Values for the (divide_by, unit) pair to select is defined by the out_format parameter.
for 1 -> out_format='datetime'
for 2 -> out_format=anything else
'''
(divide_by, unit) = (10**9, 's') if out_format=='datetime' else (24*60*60*10**9, 'D')
start_u = start.value//divide_by
end_u = end.value//divide_by
return pd.to_datetime(np.random.randint(start_u, end_u, n), unit=unit)
start = pd.to_datetime('2015-01-01')
end = pd.to_datetime('2018-01-01')
random_datetimes_or_dates(start, end, out_format='not datetime')
# ------------------------------------------------
# ------------------------------------------------
# Create a Sample Dataframe
# ------------------------------------------------
# ------------------------------------------------
mydf_1 = pd.DataFrame({'ID':random.choices(range(4), k=20),
'yyyymm_1':random_datetimes_or_dates(start, end, out_format='not datetime'),
'other_info':random.sample(range(100), 20)})
mydf_1 = mydf_1.sort_values(by = ['ID', 'yyyymm_1'])
mydf_1
mydf_2 = pd.DataFrame({'ID':random.choices(range(6), k=5),
'yyyymm_2':random_datetimes_or_dates(start, end, out_format='not datetime', n = 5)})
mydf_2 = mydf_2.sort_values(by = ['ID', 'yyyymm_2'])
mydf_2
# ------------------------------------------------
# ------------------------------------------------
# Backlog Dev Test
# ------------------------------------------------
# ------------------------------------------------
for index, row in mydf_2.iterrows():
print(row['ID'], row['yyyymm_2'])
# ------------------------------------------------
for index, row in mydf_2.iterrows():
individual_ID = row['ID']
individual_yyyymm_2 = row['yyyymm_2']
temp_data = mydf_1[mydf_1['ID'] == individual_ID]
temp_data['yyyymm_2'] = individual_yyyymm_2
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M')
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0)
temp_data
# ------------------------------------------------
for index, row in mydf_2.iterrows():
individual_ID = row['ID']
individual_yyyymm_2 = row['yyyymm_2']
temp_data = mydf_1[mydf_1['ID'] == individual_ID]
len(temp_data)
# ------------------------------------------------
df_final = pd.DataFrame()
for index, row in mydf_2.iterrows():
individual_ID = row['ID']
individual_yyyymm_2 = row['yyyymm_2']
temp_data = mydf_1[mydf_1['ID'] == individual_ID]
if len(temp_data) == 0:
df_final = df_final.append(row)
else:
temp_data['yyyymm_2'] = individual_yyyymm_2
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M')
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0)
if 1 in np.array(temp_data['temp_flag']):
continue
else:
df_final = df_final.append(row)
df_final
# ------------------------------------------------
df_final = pd.DataFrame()
for index, row in mydf_2.iterrows():
df_final.append(row)
df_final
# ------------------------------------------------
# ------------------------------------------------
# ------------------------------------------------
# Contributed Function
# ------------------------------------------------
# ------------------------------------------------
def mydf_2_search_in_mydf_1_pandas_way(data_to_loop):
'''
Signature: dataframe -> dataframe
Author: @Korkrid Akepanidtaworn
Description: Loop through the base table (table 1) against the master table (table 2)
# array([[1.0, Timestamp('2015-12-04 00:00:00')],
# [2.0, Timestamp('2016-03-13 00:00:00')],
# [2.0, Timestamp('2017-08-27 00:00:00')],
# [4.0, Timestamp('2015-12-23 00:00:00')],
# [5.0, Timestamp('2017-08-23 00:00:00')]], dtype=object)
'''
df_final = pd.DataFrame()
for index, row in data_to_loop.iterrows():
individual_ID = row['ID']
individual_yyyymm_2 = row['yyyymm_2']
temp_data = mydf_1[mydf_1['ID'] == individual_ID]
if len(temp_data) == 0:
df_final = df_final.append(row)
else:
temp_data['yyyymm_2'] = individual_yyyymm_2
temp_data['elapased_months'] = temp_data['yyyymm_2'].dt.to_period('M') - temp_data['yyyymm_1'].dt.to_period('M')
temp_data['temp_flag'] = np.where(temp_data['elapased_months'].isin([-2, -1, 0 , 1]), 1, 0)
if 1 in np.array(temp_data['temp_flag']):
continue
else:
df_final = df_final.append(row)
return df_final
mydf_3 = pd.DataFrame({'ID':random.choices(range(6), k=5),
'yyyymm_2':random_datetimes_or_dates(start, end, out_format='not datetime', n = 5)})
mydf_3 = mydf_3.sort_values(by = ['ID', 'yyyymm_2'])
mydf_3
mydf_2_search_in_mydf_1_pandas_way(mydf_3)
# array([[1.0, Timestamp('2015-12-04 00:00:00')],
# [2.0, Timestamp('2016-03-13 00:00:00')],
# [2.0, Timestamp('2017-08-27 00:00:00')],
# [4.0, Timestamp('2015-12-23 00:00:00')],
# [5.0, Timestamp('2017-08-23 00:00:00')]], dtype=object)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment