# Data science 101
df["Age"] = df["Age"]+1 | |
#Dealing with Nan values | |
df.dropna(subset=["Price"],axis=0,inpace=True) #same as below. axis=1 will drop columns | |
df=df.dropna(subset=["Price"],axis=0) #same as above | |
mean = df["Losses"].mean() | |
df["Losses"].replace(np.nan,mean) | |
# Data formatting
df["CityMPG"] = 235 / df["CityMPG"]  # convert mpg to L/100km: 235 divided by each value
df.rename(columns={"CityMPG": "CityL/100km"}, inplace=True)
# Convert datatypes (say the Price column got assigned as an object)
df["Price"].tail(5)
df.dtypes  # an attribute, not a method
df["Price"] = df["Price"].astype("int")
# Data normalization so features have a similar intrinsic influence on the analytical model
df["Length"] = df["Length"] / df["Length"].max()  # Simple Feature Scaling: divide by max
df["Length"] = (df["Length"] - df["Length"].min()) / (df["Length"].max() - df["Length"].min())  # Min-Max
df["Length"] = (df["Length"] - df["Length"].mean()) / df["Length"].std()  # Z-score: (x - mu) / sigma
# Binning
bins = np.linspace(min(df["Price"]), max(df["Price"]), 4)
group_names = ["low", "medium", "high"]
df["Price-binned"] = pd.cut(df["Price"], bins, labels=group_names, include_lowest=True)
#"one-hot" encoding is converting categorical column values to 0 or 1 | |
pd.get_dummies(df["fuel"]#creates 2 columns with "gas" as 1 and "diesel" as 0 | |
# replace "?" to NaN | |
df.replace("?", np.nan, inplace = True) | |
df.head(5) | |
missing_data = df.isnull() | |
missing_data.head(5) | |
# Boxplot
sns.boxplot(x="drive-wheels", y="price", data=df)
# Scatter plot
x = df["engine-size"]
y = df["price"]
plt.scatter(x, y)
plt.xlabel("size of the engine")
plt.ylabel("price of the car")
plt.title("scatter plot of engine size and price")
# groupby
df_test = df[["drive-wheels", "body-style", "price"]]
df_grp = df_test.groupby(["drive-wheels", "body-style"], as_index=False).mean()
df_grp
# Pivot table
df_piv = df_grp.pivot(index="drive-wheels", columns="body-style")
# Heatmap of the pivot table
plt.pcolor(df_piv, cmap="RdBu")
plt.colorbar()
plt.show()
# Correlation
sns.regplot(x="engine-size", y="price", data=df)
plt.ylim(0,)
pearson_coef, p_value = stats.pearsonr(df["horsepower"], df["price"])
# ANOVA
df_anova = df[["make", "price"]]
grouped_anova = df_anova.groupby(["make"])
anova_results_1 = stats.f_oneway(grouped_anova.get_group("honda")["price"], grouped_anova.get_group("subaru")["price"])
# Regression
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
x = df[["highwaympg"]]
y = df[["price"]]
lm.fit(x, y)  # simple linear regression (SLR)
z = df[["horsepower", "curbweight", "enginesize", "highwaympg"]]
lm.fit(z, df["price"])  # multiple linear regression (MLR)
yhat = lm.predict(z)  # predict with the most recent fit (MLR), so pass z, not x
lm.intercept_
lm.coef_
# Residual plot: if residuals are randomly spread along the x axis, a linear model is appropriate;
# if they show curvature, a non-linear model is appropriate;
# if they're unevenly spread around the x axis, the model is incorrect
sns.residplot(x=df["highwaympg"], y=df["price"])
# Distribution plot comparing actual and fitted values
ax1 = sns.distplot(df["price"], hist=False, color="r", label="Actual Values")
sns.distplot(yhat, hist=False, color="b", label="Fitted Values", ax=ax1)
# Polynomial regression of 3rd order (np.polyfit expects 1-D arrays, so pass Series)
f = np.polyfit(df["highwaympg"], df["price"], 3)
p = np.poly1d(f)
print(p)
# For polynomials with more than one input dimension
from sklearn.preprocessing import PolynomialFeatures
pr = PolynomialFeatures(degree=2, include_bias=False)
z_pr = pr.fit_transform(z)  # transform the multi-feature matrix z from above
# Using a Pipeline to perform 1. normalization, 2. transformation, 3. prediction in one object simplifies the code
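# A minimal sketch of such a pipeline, reusing z and df["price"] from above
# (the step names "scale"/"polynomial"/"model" are arbitrary labels):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
pipe = Pipeline([
    ("scale", StandardScaler()),                             # 1. normalization
    ("polynomial", PolynomialFeatures(include_bias=False)),  # 2. transformation
    ("model", LinearRegression()),                           # 3. prediction
])
pipe.fit(z, df["price"])     # runs every step in order, then fits the model
yhat_pipe = pipe.predict(z)  # pushes z through the same steps, then predicts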
# Mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(df["price"], yhat)  # (actual values, predicted values)
# R-squared value
x = df[["highwaympg"]]  # sklearn expects a 2-D feature matrix
y = df["price"]
lm.fit(x, y)
lm.score(x, y)
# If R-squared is negative, it could be due to overfitting
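# A minimal sketch with synthetic (hypothetical) data showing how that happens:
# a degree-12 polynomial fit to 15 noisy points tracks the training data closely
# but often scores worse than the mean (R-squared < 0) on held-out data.
from sklearn.model_selection import train_test_split
rng = np.random.default_rng(0)
x_demo = rng.uniform(0, 1, (30, 1))
y_demo = 2 * x_demo.ravel() + rng.normal(0, 0.1, 30)
xd_train, xd_test, yd_train, yd_test = train_test_split(x_demo, y_demo, test_size=0.5, random_state=0)
poly = PolynomialFeatures(degree=12)  # far too flexible for 15 training points
over = LinearRegression().fit(poly.fit_transform(xd_train), yd_train)
print(over.score(poly.transform(xd_test), yd_test))  # typically negative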
# Train/test split - model evaluation
from sklearn.model_selection import train_test_split
y_data = df["price"]
x_data = df.drop("price", axis=1)  # the features / independent variables
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)
# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
lr = LinearRegression()
scores = cross_val_score(lr, x_data, y_data, cv=3)
np.mean(scores)
yhat = cross_val_predict(lr, x_data, y_data, cv=3)
# Ridge regression
from sklearn.linear_model import Ridge
RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x, y)
yhat = RidgeModel.predict(x)
# From: https://stackoverflow.com/questions/50302180/difference-between-dfx-dfx-dfx-dfx-and-df-x
# df[x]: index a column using variable x. Returns pd.Series
# df[[x]]: index/slice a single-column DataFrame using variable x. Returns pd.DataFrame
# df['x']: index a column named 'x'. Returns pd.Series
# df[['x']]: index/slice a single-column DataFrame having only one column named 'x'. Returns pd.DataFrame
# df.x: dot-accessor notation, equivalent to df['x'] (there are, however, limitations on what x can be named for dot notation to work). Returns pd.Series
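# A quick check of the rules above on a toy DataFrame (hypothetical example):
toy = pd.DataFrame({"x": [1, 2, 3]})
col = "x"
print(type(toy[col]))    # <class 'pandas.core.series.Series'>
print(type(toy[[col]]))  # <class 'pandas.core.frame.DataFrame'>
print(type(toy["x"]))    # Series
print(type(toy[["x"]]))  # DataFrame
print(type(toy.x))       # Series, same as toy["x"]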