Skip to content

Instantly share code, notes, and snippets.

@AkiyonKS
Created August 9, 2022 02:08
Show Gist options
  • Save AkiyonKS/d42d89c804f96bac2f7634435bb00078 to your computer and use it in GitHub Desktop.
Save AkiyonKS/d42d89c804f96bac2f7634435bb00078 to your computer and use it in GitHub Desktop.
fetch collections of trains
import pandas as pd
import collections
import os
# Function definitions start here
# Convert dict data into a DataFrame
def edit_dict(dict):
    """Turn a ``{value: count}`` mapping into a sorted table.

    Args:
        dict: Mapping from an observed value to its count, for example a
            ``collections.Counter``. The parameter name shadows the builtin
            and is kept only so existing callers keep working.

    Returns:
        A DataFrame with the columns ``count``, ``value`` and ``value_r``,
        sorted by ``value`` and re-indexed from 0. ``value_r`` holds empty
        strings; it is the column reserved for correction information.
    """
    # orient='index' puts the keys in the index and the counts in column 0.
    df = pd.DataFrame.from_dict(dict, orient='index').reset_index()
    # Rename the columns
    df = df.rename(columns={'index': 'value', 0: 'count'})
    # Sort by value, discard the old row order, then keep count and value
    df = (
        df.sort_values("value")
        .reset_index(drop=True)
        .reindex(columns=["count", "value"])
    )
    # Add the column for correction information (a scalar broadcasts to all rows)
    df['value_r'] = ''
    return df
# For the rows whose column (col_name) equals value, extract the col_name2
# values, remove duplicates and return them joined with ","
def fetch_column_values(value, col_name, trains, col_name2):
    """Return the distinct ``col_name2`` values of the rows matching ``value``.

    Args:
        value: Value to look for in ``trains[col_name]``.
        col_name: Name of the column that is compared against ``value``.
        trains: DataFrame to search.
        col_name2: Name of the column whose values are collected.

    Returns:
        A comma-separated string of the distinct values in ascending order.
        ``photo_id`` values are converted to ``int`` and sorted numerically;
        every other column is converted to text and sorted as text. Missing
        values are skipped. An empty string is returned when nothing matches.
    """
    matched = trains.loc[trains[col_name] == value, col_name2]
    # photo_id is ordered numerically so that 2 comes before 10.
    convert = int if col_name2 == 'photo_id' else str
    # Missing values are dropped first: NaN cannot be converted by int() and
    # would otherwise show up as a literal "nan" entry in the joined text.
    distinct = {convert(item) for item in matched.dropna()}
    return ','.join(map(str, sorted(distinct)))
# Given one or more column names, collect each distinct value and its count
# (the result is meant to be saved to csv files afterwards)
# col_names = ["label", "編成","路線","車両","形式","愛称","company_name"]
def _joined_matches(df, keys, values, target):
    """For each entry of ``values``, join the ``target`` values of matching rows.

    ``keys`` is the stringified key column. It is paired with the raw
    ``target`` column so that the comparison inside ``fetch_column_values``
    sees the same text that was counted.
    """
    pair = df[[target]].assign(_key=keys)
    return [fetch_column_values(item, "_key", pair, target) for item in values]


def fetch_collections_trains(df, col_names):
    """Summarise the distinct values of the selected columns.

    Args:
        df: DataFrame of train information (trains.csv). The ``photo_id``
            column is read for every summarised column, and ``company_name``
            for every column other than ``company_name`` and
            ``keishiki_company``.
        col_names: Column names to summarise. Names that are not columns of
            ``df`` and repeated names are ignored.

    Returns:
        A dict mapping each summarised column name to the table built by
        ``edit_dict`` (``count``, ``value``, ``value_r``), extended with a
        ``photo_id`` column and, where applicable, a ``company_name`` column
        holding comma-separated values.

    Each processed column name is printed as progress output.
    """
    # Keep the requested columns that exist, once each, in the caller's order;
    # a plain set intersection would make the processing order arbitrary.
    available = set(df.columns)
    selected = [name for name in dict.fromkeys(col_names) if name in available]

    tables = []
    for name in selected:
        print(name)
        # Convert the column to text and count each value with collections.Counter
        as_text = df[name].map(str)
        # Convert the counts into a DataFrame with edit_dict
        table = edit_dict(collections.Counter(as_text))
        # Add the company_name (railway company name) and photo_id information.
        # The lookup uses the text form of the column: the counted values are
        # strings, so comparing them with the raw column would never match
        # numeric or missing entries.
        # NOTE(review): the source had lost its indentation; photo_id is taken
        # to be added for every column, not only inside the `if` - confirm.
        if name not in ["company_name", "keishiki_company"]:
            table['company_name'] = _joined_matches(
                df, as_text, table['value'], "company_name"
            )
        table['photo_id'] = _joined_matches(df, as_text, table['value'], "photo_id")
        tables.append(table)

    # Return the tables keyed by column name
    return dict(zip(selected, tables))
def save_collections_data(dict_df, save_file_add_name="", csv_dir="../csv"):
    """Write each table of ``dict_df`` to ``collections_<name><suffix>.csv``.

    Args:
        dict_df: Mapping from a column name to the DataFrame to save.
        save_file_add_name: Text appended to the file name before ``.csv``.
        csv_dir: Directory that receives the files. It must already exist.

    An existing file is first moved to ``..._backup.csv``, replacing any
    previous backup, and the new data is then written without the index.
    """
    # Names used in the saved file names
    fnames = {
        "label": "label",
        "keishiki_company": "keishiki_company",
        "company_keishiki": "company_keishiki",
        "company_name": "company_name",
        "形式": "keishiki",
        "愛称": "aishou",
        "編成": "hensei",
        "路線": "rosen",
        "車両": "syaryou",
    }
    # Save as csv files
    # For "形式" the file name becomes collections_keishiki.csv
    for key, frame in dict_df.items():
        # Column names without an entry above are used as they are.
        stem = 'collections_' + str(fnames.get(key, key)) + save_file_add_name
        path = os.path.join(csv_dir, stem + '.csv')
        if os.path.isfile(path):
            # os.replace overwrites an existing backup on every platform;
            # os.rename raises on Windows when the destination exists.
            os.replace(path, os.path.join(csv_dir, stem + '_backup.csv'))
        frame.to_csv(path, index=False)
# Function definitions end here
# Script section starts below
# Load the csv file of train information
file_path = "../csv/trains.csv"
df = pd.read_csv(file_path)
# Drop the first column when it is not photo_id (an unnamed column picked up
# while reading). The label is compared as a scalar so the test does not rely
# on the truthiness of a one-element array.
if df.columns[0] != 'photo_id':
    df = df.drop(columns=df.columns[[0]])
# Collect the values and their counts for several columns and save them as csv
col_names = ["label", "company_name","形式", "編成","路線","車両","愛称"]
tmp = fetch_collections_trains(df, col_names)
save_collections_data(tmp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment