# Skip to content

# Instantly share code, notes, and snippets.

# @jeremyko
# Last active August 16, 2023 09:20
# Show Gist options
#   • Save jeremyko/b92399a94dd01db3cf5499a3a2b5afc4 to your computer and use it in GitHub Desktop.
# Save jeremyko/b92399a94dd01db3cf5499a3a2b5afc4 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import sys
import time
# Positions of the closest df_big row, appended by whichever for_loop* benchmark runs.
loop_result = []
# Positions of the closest df_big row, appended by get_distance().
apply_result = []
# //////////////////////////////////////////////////////////////////////////////
# df_big: 5000 random (X, Y) points that are searched.
df_big = pd.DataFrame(np.random.random_sample((5000, 2)), columns=["X", "Y"])
# df_small: 50 random query points; the "dummy" column is not read by any function visible here.
df_small = pd.DataFrame(np.random.random_sample((50, 3)), columns=["x_val", "y_val", "dummy"])
# Prepare identical copies of the data up front for the comparison test
# (apply_test() works on the copies, the for_loop* functions on the originals).
df_big_copy = df_big.copy()
df_small_copy = df_small.copy()
# //////////////////////////////////////////////////////////////////////////////
def for_loop():
    """Benchmark a nested loop that reads every scalar through ``.loc``.

    For each row label of the module-level ``df_small``, computes the squared
    Euclidean distance to every row of ``df_big`` one scalar at a time, then
    appends the position of the smallest distance to ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    for index_small in df_small.index:
        list_distance = []
        for index_big in df_big.index:
            distance = (df_small.loc[index_small, "x_val"] - df_big.loc[index_big, "X"]) ** 2 + (
                df_small.loc[index_small, "y_val"] - df_big.loc[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        # Skip the lookup when df_big is empty: min() of an empty list raises.
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def for_loop_iterrows():
    """Benchmark a nested loop that walks both frames with ``iterrows()``.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to every row of ``df_big`` using the row objects
    yielded by ``iterrows()``, then appends the position of the smallest
    distance to ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    for _, small_row in df_small.iterrows():
        list_distance = []
        for _, big_row in df_big.iterrows():
            distance = (small_row["x_val"] - big_row["X"]) ** 2 + (small_row["y_val"] - big_row["Y"]) ** 2
            list_distance.append(distance)
        # Skip the lookup when df_big is empty: min() of an empty list raises.
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def for_loop_iterrows_at():
    """Benchmark ``iterrows()`` iteration combined with ``.at`` scalar reads.

    Only the index labels yielded by ``iterrows()`` are used; the values are
    read back from the module-level ``df_small`` / ``df_big`` through ``.at``.
    The position of the smallest squared distance for each ``df_small`` row
    is appended to ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    # iterrows() is kept on purpose even though the rows are discarded:
    # its iteration cost is part of what this variant measures.
    for index_small, _ in df_small.iterrows():
        list_distance = []
        for index_big, _ in df_big.iterrows():
            distance = (df_small.at[index_small, "x_val"] - df_big.at[index_big, "X"]) ** 2 + (
                df_small.at[index_small, "y_val"] - df_big.at[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        # Skip the lookup when df_big is empty: min() of an empty list raises.
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def for_loop_at():
    """Benchmark a nested loop over index labels with ``.at`` scalar reads.

    Same computation as ``for_loop`` but every scalar is read through ``.at``
    instead of ``.loc``. The position of the smallest squared distance for
    each ``df_small`` row is appended to ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    for index_small in df_small.index:
        list_distance = []
        for index_big in df_big.index:
            distance = (df_small.at[index_small, "x_val"] - df_big.at[index_big, "X"]) ** 2 + (
                df_small.at[index_small, "y_val"] - df_big.at[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        # Skip the lookup when df_big is empty: min() of an empty list raises.
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def for_loop_vectorization():
    """Benchmark a single loop whose inner distance step is a Series expression.

    For each row label of the module-level ``df_small``, subtracts the whole
    ``df_big["X"]`` / ``df_big["Y"]`` columns at once, and appends the
    ``argmin()`` of the resulting squared distances to ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    for index_small in df_small.index:
        list_distance = (df_small.at[index_small, "x_val"] - df_big["X"]) ** 2 + (
            df_small.at[index_small, "y_val"] - df_big["Y"]
        ) ** 2
        # argmin() is only attempted when there is at least one distance.
        if len(list_distance) > 0:
            min_index = list_distance.argmin()
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def for_loop_vectorization_numpy():
    """Benchmark a single loop whose inner distance step runs on NumPy arrays.

    Same as ``for_loop_vectorization`` except that the ``df_big`` columns are
    converted with ``to_numpy()`` before the arithmetic. The ``argmin()`` of
    the squared distances for each ``df_small`` row is appended to
    ``loop_result``.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    for index_small in df_small.index:
        # to_numpy() stays inside the loop so the measured per-row work
        # matches what get_distance() does under DataFrame.apply.
        list_distance = (df_small.at[index_small, "x_val"] - df_big["X"].to_numpy()) ** 2 + (
            df_small.at[index_small, "y_val"] - df_big["Y"].to_numpy()
        ) ** 2
        # argmin() is only attempted when there is at least one distance.
        if len(list_distance) > 0:
            min_index = list_distance.argmin()
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def apply_test():
    """Benchmark the row-wise ``DataFrame.apply`` version of the search.

    Applies ``get_distance`` to every row of the module-level
    ``df_small_copy``, passing ``df_big_copy`` as the extra positional
    argument. The return value of ``apply`` is discarded.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for timing
    # intervals; time.time() is coarse on some platforms and can be stepped.
    start = time.perf_counter()
    # ``args`` is documented as a tuple of extra positional arguments.
    df_small_copy.apply(get_distance, axis=1, args=(df_big_copy,))
    elapsed = time.perf_counter() - start
    print("elapsed (apply) = ", elapsed)
    return elapsed
# //////////////////////////////////////////////////////////////////////////////
def get_distance(small_row, arg_df_big):
    """Record which row of *arg_df_big* lies closest to *small_row*.

    Computes the squared Euclidean distance between the point
    (``small_row["x_val"]``, ``small_row["y_val"]``) and every
    (``"X"``, ``"Y"``) pair in *arg_df_big*, then appends the position of the
    smallest distance to the module-level ``apply_result`` list. Nothing is
    appended when *arg_df_big* has no rows.

    Args:
        small_row: Row-like object indexable by ``"x_val"`` and ``"y_val"``.
        arg_df_big: Frame providing the ``"X"`` and ``"Y"`` columns.

    Returns:
        None.
    """
    delta_x = small_row["x_val"] - arg_df_big["X"].to_numpy()
    delta_y = small_row["y_val"] - arg_df_big["Y"].to_numpy()
    squared_distances = delta_x ** 2 + delta_y ** 2
    # Guard clause: argmin() is not attempted on an empty array.
    if len(squared_distances) == 0:
        return
    apply_result.append(squared_distances.argmin())
# //////////////////////////////////////////////////////////////////////////////
# //////////////////////////////////////////////////////////////////////////////
def main(argv):
    """Run one loop benchmark and the apply benchmark, then print their ratio.

    Args:
        argv: Command-line arguments; not read by this function.
    """
    # Exactly one loop variant is active at a time; swap the commented lines
    # below to benchmark a different one against apply_test().
    loop_elapsed = for_loop()
    # loop_elapsed = for_loop_iterrows()
    # loop_elapsed = for_loop_iterrows_at()
    # loop_elapsed = for_loop_at()
    # loop_elapsed = for_loop_vectorization()
    # loop_elapsed = for_loop_vectorization_numpy()
    apply_elapsed = apply_test()
    # A timer reading of 0 (possible with a coarse clock) would raise
    # ZeroDivisionError; report an infinite ratio instead of crashing.
    speed_ratio = loop_elapsed / apply_elapsed if apply_elapsed > 0 else float("inf")
    print("speed ratio = ", speed_ratio)
    # print("\n")
    # print("loop result ---> ", loop_result)
    # print("apply result ---> ", apply_result)


# //////////////////////////////////////////////////////////////////////////////
if __name__ == "__main__":
    main(sys.argv)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment