-
-
Save AkiyonKS/42404399c3b0ddbeab35f5df56116b2b to your computer and use it in GitHub Desktop.
fix wrong records of trains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import os | |
import shutil | |
# trains.csvを読み込み | |
filename = "trains" | |
file_path = "../csv/" + filename + ".csv" | |
df = pd.read_csv(file_path) | |
# 無名列がある場合には削除 | |
if df.columns[[0]] != 'photo_id': | |
df = df.drop(columns=df.columns[[0]]) | |
# photo_idをintに変換 | |
df['photo_id'] = list(map(int, df['photo_id'])) | |
# 修正情報を記載したcsvを読み込む | |
filename2 = "fix_wrong_records2" | |
file_path2 = '../csv/' + filename2 + '.csv' | |
df2 = pd.read_csv(file_path2) | |
# 無名列がある場合には削除 | |
if df2.columns[[0]] != 'check': | |
df2 = df2.drop(columns=df2.columns[[0]]) | |
# checkをintに変換して1の行を抽出 | |
df2['check'] = list(map(int, df2['check'])) | |
df2_r = df2.loc[df2.check == 1] | |
# バックアップとっておく | |
if os.path.isfile(file_path): | |
shutil.copy2(file_path, '../csv/' + filename + '_backup.csv') | |
if os.path.isfile(file_path2): | |
shutil.copy2(file_path2, "../csv/" + filename2 + "_backup.csv") | |
# 変更状況を記録するための変数を定義 | |
col_names_modified = [] # 変更した値の列の名前 | |
check = 0 # 変更した数 | |
# for文で繰り返し処理 | |
for i in range(len(df2_r)): | |
row0 = df2_r[i:i+1] | |
row = row0.iloc[0] | |
print(row) | |
check += 1 | |
if row.col_name in df.columns: | |
print(str(row.col_name)) | |
elif row.col_name == "keishiki_company": | |
# "形式"_"鉄道会社名"が変更する列の値だった場合に"keishiki_company"列を追加 | |
print("add column keishiki_company") | |
df["keishiki_company"] = list(df.apply(lambda x: "_".join([str(x["形式"]), x.company_name]), axis=1)) | |
# 変更したい値を含む行を抽出 | |
if row.match_type == 'partial': | |
# 部分一致の場合 | |
df_r = df.loc[list(map(lambda x: row.value in str(x), df[row.col_name]))] | |
else: | |
# 完全一致の場合 | |
df_r = df.loc[df[row.col_name] == row.value] | |
print(len(df_r)) | |
if len(df_r) > 0: | |
col_names_modified.append(row.col_name) | |
# 値をvalueからvalue_rに変更 | |
if row.match_type == 'partial': | |
# 部分一致の場合 | |
df_r.loc[:, row.col_name] = list(map(lambda x: x.replace(row.value, row.value_r), df_r[row.col_name])) | |
else: | |
# 完全一致の場合 | |
df_r.loc[:, row.col_name] = row.value_r | |
# 変更した値が他の列に影響を及ぼす場合、その列も変更 | |
if row.col_name in ["形式", "company_name", "keishiki_company"]: | |
col_names_modified.append("label") | |
if row.col_name == "keishiki_company": | |
col_names_modified.append("形式") | |
df_r.loc[:, "形式"] = list(map(lambda x: x.split("_")[0], df_r['keishiki_company'])) | |
df_r2 = df_r.loc[list(map(lambda x: row.value in str(x), df_r["label"]))] | |
df_r2.loc[:, "label"] = list(map(lambda x: str(x).replace(row.value, row.value_r), df_r["label"])) | |
df_r.update(df_r2) # 値を上書き | |
df.update(df_r) # 値を上書き | |
# 一時的に必要だった列を削除 | |
if "keishiki_company" in df.columns: | |
df.drop(columns=["keishiki_company"]) | |
row0.check = int(9) # 処理が終わったのでcheckを9にする | |
df2_r.update(row0) # 値を上書き | |
df2.update(df2_r) # 値を上書き | |
df2.to_csv(file_path2, index=False) # 更新された修正情報をcsvファイルに保存 | |
# checkを文字列に変換 | |
df2['check'] = list(map(lambda x: str(int(x)), df2['check'])) | |
# 修正情報をcsvファイルに保存 | |
df2.to_csv(file_path2, index=False) | |
if check > 0: | |
# 修正箇所があった場合 | |
# 並び変えしてインデックスつけなおし | |
sort_arr = ['label','形式','company_name','photo_id'] | |
df = df.sort_values(sort_arr).reset_index().drop(columns="index") | |
# photo_idを文字列に変換 | |
df["photo_id"] = list(map(lambda x: str(int(x)), df["photo_id"])) | |
df.to_csv(file_path) # csvファイルに保存 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment