Last active
August 16, 2023 09:20
-
-
Save jeremyko/b92399a94dd01db3cf5499a3a2b5afc4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import sys | |
import time | |
# Positions of the closest df_big row, collected by the loop variants and by
# the apply variant respectively.
loop_result = list()
apply_result = list()
# //////////////////////////////////////////////////////////////////////////////
# Random test data: 5000 (X, Y) rows to search and 50 query rows.
df_big = pd.DataFrame(
    data=np.random.random_sample(size=(5000, 2)),
    columns=["X", "Y"],
)
df_small = pd.DataFrame(
    data=np.random.random_sample(size=(50, 3)),
    columns=["x_val", "y_val", "dummy"],
)
# Prepare copies of the same data up front for the comparison test.
df_big_copy = df_big.copy()
df_small_copy = df_small.copy()
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop() -> float:
    """Time the nested loop that reads every cell through ``.loc``.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to every row of the module-level ``df_big`` and appends
    the position of the closest ``df_big`` row to the module-level
    ``loop_result`` list. Nothing is appended for a query row when ``df_big``
    has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    for index_small in df_small.index:
        list_distance = []
        for index_big in df_big.index:
            # The per-cell .loc lookups stay inside the inner loop: their cost
            # is what this variant measures.
            distance = (df_small.loc[index_small, "x_val"] - df_big.loc[index_big, "X"]) ** 2 + (
                df_small.loc[index_small, "y_val"] - df_big.loc[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop_iterrows() -> float:
    """Time the nested loop that walks both frames with ``iterrows()``.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to every row of the module-level ``df_big`` and appends
    the position of the closest ``df_big`` row to the module-level
    ``loop_result`` list. Nothing is appended for a query row when ``df_big``
    has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    for _, small_row in df_small.iterrows():
        list_distance = []
        # iterrows() is restarted for every query row; that per-row cost is
        # part of what this variant measures.
        for _, big_row in df_big.iterrows():
            distance = (small_row["x_val"] - big_row["X"]) ** 2 + (small_row["y_val"] - big_row["Y"]) ** 2
            list_distance.append(distance)
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop_iterrows_at() -> float:
    """Time the nested loop that iterates with ``iterrows()`` but reads via ``.at``.

    Only the index labels produced by ``iterrows()`` are used; the cell values
    are fetched with ``.at``. For each row of the module-level ``df_small``,
    the position of the closest row of the module-level ``df_big`` (by squared
    Euclidean distance) is appended to the module-level ``loop_result`` list.
    Nothing is appended for a query row when ``df_big`` has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    for index_small, _ in df_small.iterrows():
        list_distance = []
        # The row objects yielded by iterrows() are discarded on purpose: this
        # variant combines iterrows() iteration with .at cell access.
        for index_big, _ in df_big.iterrows():
            distance = (df_small.at[index_small, "x_val"] - df_big.at[index_big, "X"]) ** 2 + (
                df_small.at[index_small, "y_val"] - df_big.at[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop_at() -> float:
    """Time the nested loop that iterates over index labels and reads via ``.at``.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to every row of the module-level ``df_big`` and appends
    the position of the closest ``df_big`` row to the module-level
    ``loop_result`` list. Nothing is appended for a query row when ``df_big``
    has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    for index_small in df_small.index:
        list_distance = []
        for index_big in df_big.index:
            # The per-cell .at lookups stay inside the inner loop: their cost
            # is what this variant measures.
            distance = (df_small.at[index_small, "x_val"] - df_big.at[index_big, "X"]) ** 2 + (
                df_small.at[index_small, "y_val"] - df_big.at[index_big, "Y"]
            ) ** 2
            list_distance.append(distance)
        if list_distance:
            min_index = list_distance.index(min(list_distance))
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop_vectorization() -> float:
    """Time the single loop that vectorizes the inner search with pandas Series.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to all rows of the module-level ``df_big`` in one Series
    expression and appends the position of the smallest distance to the
    module-level ``loop_result`` list. Nothing is appended for a query row when
    ``df_big`` has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    # The column selections do not depend on the query row, so do them once.
    big_x = df_big["X"]
    big_y = df_big["Y"]
    for index_small in df_small.index:
        list_distance = (df_small.at[index_small, "x_val"] - big_x) ** 2 + (
            df_small.at[index_small, "y_val"] - big_y
        ) ** 2
        # argmin() raises on an empty Series, so skip that case.
        if len(list_distance) > 0:
            min_index = list_distance.argmin()
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def for_loop_vectorization_numpy() -> float:
    """Time the single loop that vectorizes the inner search with NumPy arrays.

    For each row of the module-level ``df_small``, computes the squared
    Euclidean distance to all rows of the module-level ``df_big`` in one array
    expression and appends the position of the smallest distance to the
    module-level ``loop_result`` list. Nothing is appended for a query row when
    ``df_big`` has no rows.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for short benchmark runs.
    start = time.perf_counter()
    # The array conversions do not depend on the query row, so do them once
    # (still inside the timed region).
    big_x = df_big["X"].to_numpy()
    big_y = df_big["Y"].to_numpy()
    for index_small in df_small.index:
        list_distance = (df_small.at[index_small, "x_val"] - big_x) ** 2 + (
            df_small.at[index_small, "y_val"] - big_y
        ) ** 2
        # argmin() raises on an empty array, so skip that case.
        if len(list_distance) > 0:
            min_index = list_distance.argmin()
            loop_result.append(min_index)
    elapsed = time.perf_counter() - start
    print("elapsed (for loop) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def apply_test() -> float:
    """Time the row-wise ``DataFrame.apply`` variant of the closest-row search.

    Calls ``get_distance`` once per row of the module-level ``df_small_copy``,
    passing the module-level ``df_big_copy`` as the extra argument. The value
    returned by ``apply`` is discarded.

    Returns:
        Elapsed time in seconds.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can be too coarse (or jump) for a run this short.
    start = time.perf_counter()
    # ``args`` is documented as a tuple of extra positional arguments.
    df_small_copy.apply(get_distance, axis=1, args=(df_big_copy,))
    elapsed = time.perf_counter() - start
    print("elapsed (apply) = ", elapsed)
    return elapsed
# ////////////////////////////////////////////////////////////////////////////// | |
def get_distance(small_row, arg_df_big):
    """Record and return the position of the row of *arg_df_big* closest to *small_row*.

    Closeness is the squared Euclidean distance between
    ``(small_row["x_val"], small_row["y_val"])`` and each ``("X", "Y")`` row of
    *arg_df_big*.

    Args:
        small_row: Mapping-like row providing ``"x_val"`` and ``"y_val"``.
        arg_df_big: DataFrame providing the ``"X"`` and ``"Y"`` columns.

    Returns:
        The position of the closest row, which is also appended to the
        module-level ``apply_result`` list, or ``None`` (with nothing
        appended) when *arg_df_big* has no rows.
    """
    list_distance = (small_row["x_val"] - arg_df_big["X"].to_numpy()) ** 2 + (
        small_row["y_val"] - arg_df_big["Y"].to_numpy()
    ) ** 2
    # argmin() raises on an empty array, so there is nothing to record.
    if len(list_distance) == 0:
        return None
    min_index = list_distance.argmin()
    apply_result.append(min_index)
    # Returning the index as well lets callers use the result directly
    # instead of reading it back from the module-level list.
    return min_index
# ////////////////////////////////////////////////////////////////////////////// | |
# ////////////////////////////////////////////////////////////////////////////// | |
def main(argv) -> None:
    """Run one loop variant and the apply variant, then print their speed ratio.

    Args:
        argv: Command-line arguments; currently unused.
    """
    # Swap in one of the commented-out variants to benchmark it instead.
    loop_elapsed = for_loop()
    # loop_elapsed = for_loop_iterrows()
    # loop_elapsed = for_loop_iterrows_at()
    # loop_elapsed = for_loop_at()
    # loop_elapsed = for_loop_vectorization()
    # loop_elapsed = for_loop_vectorization_numpy()
    apply_elapsed = apply_test()
    # A very fast apply run can measure as 0.0 on a coarse timer; avoid
    # crashing with ZeroDivisionError in that case.
    if apply_elapsed > 0:
        print("speed ratio = ", loop_elapsed / apply_elapsed)
    else:
        print("speed ratio = ", "n/a (apply time below timer resolution)")
    # print("\n")
    # print("loop result ---> ", loop_result)
    # print("apply result ---> ", apply_result)
# ////////////////////////////////////////////////////////////////////////////// | |
# Run the benchmark only when this file is executed as a script, passing the
# command-line arguments through to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment