# chrisdmell/bus_price_clean_5.py

## bus_price_clean_5.py
# Compare the recorded prices of two specific buses on one service date and
# regress the relative price change of one bus against the other.
#
# Names this section reads but does not define: `final_cleaned_df`, `pd`,
# `np`, `plt`, `linear_model` and `display` (presumably set up earlier in the
# notebook/script -- confirm before running this section on its own).
from scipy.stats import hmean

f = final_cleaned_df.copy()

b1 = f[(f["Bus"] == "a6951a59b64579edcf822ab9ea4c0c83") & (f["Service_Date"] == "15-07-2020 00:00")]
b2 = f[(f["Bus"] == "ab479dab4a9e6bc3eaefe77a09f027ed") & (f["Service_Date"] == "15-07-2020 00:00")]

# Every distinct timestamp at which either bus was recorded, ascending.
recorded_dates_df = (
    pd.concat([b1[["RecordedAt_new"]], b2[["RecordedAt_new"]]], axis=0)
    .drop_duplicates()
    .sort_values(by="RecordedAt_new")
    .reset_index(drop=True)
)

# Left-join both buses onto the shared timeline. The first merge has no
# overlapping columns, so only the second merge actually applies the
# "_B1" / "_B2" suffixes.
joined_1 = pd.merge(recorded_dates_df, b1, on=["RecordedAt_new"], how="left", suffixes=("_actuals", "_B1"))
joined_df = pd.merge(joined_1, b2, on=["RecordedAt_new"], how="left", suffixes=("_B1", "_B2"))

cols_to_keep = ["RecordedAt_new", "Service_Date_B1", "Bus_B1", "Bus_B2", "average_price_s1_s2_filled_B1", "average_price_s1_s2_filled_B2"]

model_df = joined_df[cols_to_keep]
# Explicit copy so the column assignments below never write into a view.
model_df_2 = model_df.drop_duplicates().copy()

## replace null of service date / bus ids with the most frequent value
# The original filled Bus_B1 twice and never Bus_B2; both are filled here.
# A column with no non-null values has no mode, so it is left for fillna(0).
for col in ("Service_Date_B1", "Bus_B1", "Bus_B2"):
    counts = model_df_2[col].value_counts()
    if not counts.empty:
        model_df_2[col] = model_df_2[col].fillna(counts.idxmax())
# Remaining gaps (missing prices) become 0, which is treated as "no price".
model_df_2 = model_df_2.fillna(0)

# Within each service date, walk the rows in timestamp order and replace every
# 0 price with the next non-zero price; trailing zeros with nothing after them
# stay 0. Results are written back by index, so model_df_2 keeps its row order.
ordered = model_df_2.sort_values(by=["RecordedAt_new"])
for src, dst in (
    ("average_price_s1_s2_filled_B1", "average_price_B1_new"),
    ("average_price_s1_s2_filled_B2", "average_price_B2_new"),
):
    model_df_2[dst] = ordered.groupby("Service_Date_B1")[src].transform(
        lambda s: s.mask(s == 0).bfill().fillna(0)
    )

# Drop rows where either bus still has no price *before* computing ratios, so
# nothing is divided by zero and hmean never sees inf/NaN from those rows.
has_both_prices = (model_df_2["average_price_B1_new"] != 0) & (model_df_2["average_price_B2_new"] != 0)
model_df_2 = model_df_2[has_both_prices].copy()

## get the price change wrt to each bus price
price_gap = (model_df_2["average_price_B1_new"] - model_df_2["average_price_B2_new"]).abs()
model_df_2["price_cng_b1"] = price_gap / model_df_2["average_price_B1_new"]
model_df_2["price_cng_b2"] = price_gap / model_df_2["average_price_B2_new"]

# Row-wise harmonic mean of the two relative changes (columns picked by name
# instead of by position).
hm = hmean(model_df_2[["price_cng_b1", "price_cng_b2"]].to_numpy(), axis=1)
model_df_2["harm_mean_price_cng"] = hm

model_df_2x = model_df_2.copy()
# Relative spread of the harmonic mean: (max - min) / min.
display((max(hm) - min(hm)) / min(hm))

print("======================================================================================================")
model_df_3 = model_df_2[["price_cng_b1", "price_cng_b2"]]
model_df_3.plot()
plt.show()
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
# (X, Y) = (price change relative to bus 1, price change relative to bus 2)
regr.fit(
    np.array(model_df_2["price_cng_b1"]).reshape(-1, 1),
    np.array(model_df_2["price_cng_b2"]).reshape(-1, 1),
)
# The coefficients
print("Coefficients: \n", regr.coef_)
	# NOTE(review): this tab-indented section appears to be a verbatim duplicate
	# of the unindented script above it and has no enclosing definition in view;
	# confirm whether it should exist at all.
	#
	# Compare the recorded prices of two specific buses on one service date and
	# regress the relative price change of one bus against the other.
	#
	# Names this section reads but does not define: `final_cleaned_df`, `pd`,
	# `np`, `plt`, `linear_model` and `display` (presumably set up earlier in the
	# notebook/script -- confirm before running this section on its own).
	from scipy.stats import hmean

	f = final_cleaned_df.copy()

	b1 = f[(f["Bus"] == "a6951a59b64579edcf822ab9ea4c0c83") & (f["Service_Date"] == "15-07-2020 00:00")]
	b2 = f[(f["Bus"] == "ab479dab4a9e6bc3eaefe77a09f027ed") & (f["Service_Date"] == "15-07-2020 00:00")]

	# Every distinct timestamp at which either bus was recorded, ascending.
	recorded_dates_df = (
		pd.concat([b1[["RecordedAt_new"]], b2[["RecordedAt_new"]]], axis=0)
		.drop_duplicates()
		.sort_values(by="RecordedAt_new")
		.reset_index(drop=True)
	)

	# Left-join both buses onto the shared timeline. The first merge has no
	# overlapping columns, so only the second merge actually applies the
	# "_B1" / "_B2" suffixes.
	joined_1 = pd.merge(recorded_dates_df, b1, on=["RecordedAt_new"], how="left", suffixes=("_actuals", "_B1"))
	joined_df = pd.merge(joined_1, b2, on=["RecordedAt_new"], how="left", suffixes=("_B1", "_B2"))

	cols_to_keep = ["RecordedAt_new", "Service_Date_B1", "Bus_B1", "Bus_B2", "average_price_s1_s2_filled_B1", "average_price_s1_s2_filled_B2"]

	model_df = joined_df[cols_to_keep]
	# Explicit copy so the column assignments below never write into a view.
	model_df_2 = model_df.drop_duplicates().copy()

	## replace null of service date / bus ids with the most frequent value
	# The original filled Bus_B1 twice and never Bus_B2; both are filled here.
	# A column with no non-null values has no mode, so it is left for fillna(0).
	for col in ("Service_Date_B1", "Bus_B1", "Bus_B2"):
		counts = model_df_2[col].value_counts()
		if not counts.empty:
			model_df_2[col] = model_df_2[col].fillna(counts.idxmax())
	# Remaining gaps (missing prices) become 0, which is treated as "no price".
	model_df_2 = model_df_2.fillna(0)

	# Within each service date, walk the rows in timestamp order and replace every
	# 0 price with the next non-zero price; trailing zeros with nothing after them
	# stay 0. Results are written back by index, so model_df_2 keeps its row order.
	ordered = model_df_2.sort_values(by=["RecordedAt_new"])
	for src, dst in (
		("average_price_s1_s2_filled_B1", "average_price_B1_new"),
		("average_price_s1_s2_filled_B2", "average_price_B2_new"),
	):
		model_df_2[dst] = ordered.groupby("Service_Date_B1")[src].transform(
			lambda s: s.mask(s == 0).bfill().fillna(0)
		)

	# Drop rows where either bus still has no price *before* computing ratios, so
	# nothing is divided by zero and hmean never sees inf/NaN from those rows.
	has_both_prices = (model_df_2["average_price_B1_new"] != 0) & (model_df_2["average_price_B2_new"] != 0)
	model_df_2 = model_df_2[has_both_prices].copy()

	## get the price change wrt to each bus price
	price_gap = (model_df_2["average_price_B1_new"] - model_df_2["average_price_B2_new"]).abs()
	model_df_2["price_cng_b1"] = price_gap / model_df_2["average_price_B1_new"]
	model_df_2["price_cng_b2"] = price_gap / model_df_2["average_price_B2_new"]

	# Row-wise harmonic mean of the two relative changes (columns picked by name
	# instead of by position).
	hm = hmean(model_df_2[["price_cng_b1", "price_cng_b2"]].to_numpy(), axis=1)
	model_df_2["harm_mean_price_cng"] = hm

	model_df_2x = model_df_2.copy()
	# Relative spread of the harmonic mean: (max - min) / min.
	display((max(hm) - min(hm)) / min(hm))

	print("======================================================================================================")
	model_df_3 = model_df_2[["price_cng_b1", "price_cng_b2"]]
	model_df_3.plot()
	plt.show()
	# Create linear regression object
	regr = linear_model.LinearRegression()
	# Train the model using the training sets
	# (X, Y) = (price change relative to bus 1, price change relative to bus 2)
	regr.fit(
		np.array(model_df_2["price_cng_b1"]).reshape(-1, 1),
		np.array(model_df_2["price_cng_b2"]).reshape(-1, 1),
	)
	# The coefficients
	print("Coefficients: \n", regr.coef_)