# Data science 101
df["Age"] = df["Age"]+1 | |
#Dealing with Nan values | |
df.dropna(subset=["Price"],axis=0,inpace=True) #same as below. axis=1 will drop columns | |
df=df.dropna(subset=["Price"],axis=0) #same as above | |
mean = df["Losses"].mean() | |
df["Losses"].replace(np.nan,mean) | |
# Data formatting
df["CityMPG"] = 235 / df["CityMPG"]  # convert mpg to L/100km: 235 divided by each value
df.rename(columns={"CityMPG": "CityL/100km"}, inplace=True)
# Convert datatypes (say the Price column got assigned as an object)
df["Price"].tail(5)
df.dtypes  # an attribute, not a method
df["Price"] = df["Price"].astype("int")
# Data normalization so features have a similar intrinsic influence on the analytical model
df["Length"] = df["Length"] / df["Length"].max()  # Simple Feature Scaling: divide by max
df["Length"] = (df["Length"] - df["Length"].min()) / (df["Length"].max() - df["Length"].min())  # Min-Max
df["Length"] = (df["Length"] - df["Length"].mean()) / df["Length"].std()  # Z-score: (x - mu) / sigma
# Binning
bins = np.linspace(min(df["Price"]), max(df["Price"]), 4)
group_names = ["low", "medium", "high"]
df["Price-binned"] = pd.cut(df["Price"], bins, labels=group_names, include_lowest=True)
#"one-hot" encoding is converting categorical column values to 0 or 1 | |
pd.get_dummies(df["fuel"]#creates 2 columns with "gas" as 1 and "diesel" as 0 | |
# replace "?" to NaN | |
df.replace("?", np.nan, inplace = True) | |
df.head(5) | |
missing_data = df.isnull() | |
missing_data.head(5) | |
# Boxplot
sns.boxplot(x="drive-wheels", y="price", data=df)
# Scatter plot
x = df["engine-size"]
y = df["price"]
plt.scatter(x, y)
plt.xlabel("size of the engine")
plt.ylabel("price of the car")
plt.title("scatter plot of engine size and price")
# groupby
df_test = df[["drive-wheels", "body-style", "price"]]
df_grp = df_test.groupby(["drive-wheels", "body-style"], as_index=False).mean()
df_grp
# Pivot table
df_piv = df_grp.pivot(index="drive-wheels", columns="body-style")
# Heatmap of the pivot table
plt.pcolor(df_piv, cmap="RdBu")
plt.colorbar()
plt.show()
# Correlation
sns.regplot(x="engine-size", y="price", data=df)
plt.ylim(0,)
pearson_coef, p_value = stats.pearsonr(df["horsepower"], df["price"])
# ANOVA
df_anova = df[["make", "price"]]
grouped_anova = df_anova.groupby(["make"])
anova_results_1 = stats.f_oneway(grouped_anova.get_group("honda")["price"], grouped_anova.get_group("subaru")["price"])
# Regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
x = df[["highwaympg"]]
y = df[["price"]]
lm.fit(x, y)  # simple linear regression (SLR)
z = df[["horsepower", "curbweight", "enginesize", "highwaympg"]]
lm.fit(z, df["price"])  # multiple linear regression (MLR)
yhat = lm.predict(z)  # predict with the most recent fit (MLR), so pass z, not x
lm.intercept_
lm.coef_
# Residual plot: if residuals are randomly spread along the x axis, a linear model is appropriate;
# if they show curvature, a non-linear model is appropriate;
# if they're unevenly spread around the x axis, the model is incorrect
sns.residplot(x=df["highwaympg"], y=df["price"])
# Distribution plot comparing actual and fitted values
ax1 = sns.distplot(df["price"], hist=False, color="r", label="Actual Values")
sns.distplot(yhat, hist=False, color="b", label="Fitted Values", ax=ax1)
# Polynomial regression of 3rd order (np.polyfit expects 1-D arrays, so pass Series)
f = np.polyfit(df["highwaympg"], df["price"], 3)
p = np.poly1d(f)
print(p)
# For polynomials with more than one input dimension
from sklearn.preprocessing import PolynomialFeatures
pr = PolynomialFeatures(degree=2, include_bias=False)
z_pr = pr.fit_transform(z)  # transform the multi-feature matrix z from above
# Using a Pipeline to perform 1. normalization, 2. transformation, 3. prediction in one object simplifies the code
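# A minimal sketch of such a pipeline, reusing z and df["price"] from above
# (the step names "scale"/"polynomial"/"model" are arbitrary labels):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipe = Pipeline([
    ("scale", StandardScaler()),                             # 1. normalization
    ("polynomial", PolynomialFeatures(include_bias=False)),  # 2. transformation
    ("model", LinearRegression()),                           # 3. prediction
])
pipe.fit(z, df["price"])     # runs every step in order, then fits the model
yhat_pipe = pipe.predict(z)  # pushes z through the same steps, then predicts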
# Mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(df["price"], yhat)  # (actual values, predicted values)
# R-squared value
x = df[["highwaympg"]]  # sklearn expects a 2-D feature matrix
y = df["price"]
lm.fit(x, y)
lm.score(x, y)
# If R-squared is negative, it could be due to overfitting
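# A minimal sketch with synthetic (hypothetical) data showing how that happens:
# a degree-12 polynomial fit to 15 noisy points tracks the training data closely
# but often scores worse than the mean (R-squared < 0) on held-out data.
from sklearn.model_selection import train_test_split
rng = np.random.default_rng(0)
x_demo = rng.uniform(0, 1, (30, 1))
y_demo = 2 * x_demo.ravel() + rng.normal(0, 0.1, 30)
xd_train, xd_test, yd_train, yd_test = train_test_split(x_demo, y_demo, test_size=0.5, random_state=0)
poly = PolynomialFeatures(degree=12)  # far too flexible for 15 training points
over = LinearRegression().fit(poly.fit_transform(xd_train), yd_train)
print(over.score(poly.transform(xd_test), yd_test))  # typically negative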
# Train/test split - model evaluation
from sklearn.model_selection import train_test_split
y_data = df["price"]
x_data = df.drop("price", axis=1)  # the features / independent variables
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)
# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
lr = LinearRegression()
scores = cross_val_score(lr, x_data, y_data, cv=3)
np.mean(scores)
yhat = cross_val_predict(lr, x_data, y_data, cv=3)
# Ridge regression
from sklearn.linear_model import Ridge
RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x, y)
yhat = RidgeModel.predict(x)
# From: https://stackoverflow.com/questions/50302180/difference-between-dfx-dfx-dfx-dfx-and-df-x
# df[x]: index a column using variable x. Returns pd.Series
# df[[x]]: index/slice a single-column DataFrame using variable x. Returns pd.DataFrame
# df['x']: index a column named 'x'. Returns pd.Series
# df[['x']]: index/slice a single-column DataFrame having only one column named 'x'. Returns pd.DataFrame
# df.x: dot-accessor notation, equivalent to df['x'] (there are, however, limitations on what x can be named for dot notation to work). Returns pd.Series
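# A quick check of the rules above on a toy DataFrame (hypothetical example):
toy = pd.DataFrame({"x": [1, 2, 3]})
col = "x"
print(type(toy[col]))    # <class 'pandas.core.series.Series'>
print(type(toy[[col]]))  # <class 'pandas.core.frame.DataFrame'>
print(type(toy["x"]))    # Series
print(type(toy[["x"]]))  # DataFrame
print(type(toy.x))       # Series, same as toy["x"]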