Data science 101
df["Age"] = df["Age"]+1
#Dealing with NaN values
df.dropna(subset=["Price"],axis=0,inplace=True) #same as below; axis=1 would drop columns instead
df=df.dropna(subset=["Price"],axis=0) #same as above
mean = df["Losses"].mean()
df["Losses"].replace(np.nan,mean)
#Data formatting
df["CityMPG"] = 235/df["CityMPG"] #Divide each column value by 235
df.rename(columns={"CityMPG":"CityL/100km"},inplace=True)
#Convert datatypes (let's say the Price column got assigned as an Object)
df["Price"].tail(5)
df.dtypes #dtypes is an attribute, not a method
df["Price"] = df["Price"].astype("int")
#Data normalization so each feature has a similar intrinsic influence on the analytical model
df["Length"] = df["Length"]/df["Length"].max() #divide by max: Simple Feature Scaling
df["Length"] = (df["Length"]-df["Length"].min())/(df["Length"].max()-df["Length"].min()) #Min-Max scaling
df["Length"] = (df["Length"]-df["Length"].mean())/df["Length"].std() #Z-score: (x-mu)/sigma
#Binning
bins = np.linspace(min(df["Price"]),max(df["Price"]),4)
group_names = ["low","medium","high"]
df["Price-binned"] = pd.cut(df["Price"],bins,labels=group_names,include_lowest=True)
#"one-hot" encoding is converting categorical column values to 0 or 1
pd.get_dummies(df["fuel"]#creates 2 columns with "gas" as 1 and "diesel" as 0
# replace "?" to NaN
df.replace("?", np.nan, inplace = True)
df.head(5)
missing_data = df.isnull()
missing_data.head(5)
#Boxplot
sns.boxplot(x="drive-wheels",y="price",data=df)
#scatter plot
x=df["engine-size"]
y=df["price"]
plt.scatter(x,y)
plt.xlabel("size of the engine")
plt.ylabel("price of yhe car")
plt.title("scatter plot of engine and rice")
#groupby
df_test=df["drive-wheels","body-style","price"]
df_grp=df_test.groupby(["drive-wheels","body-style"],as_index=False).mean()
df_grp
#pivot table
df_piv=df_grp.pivot(index="drive-wheels",columns="body-style")
#heatmap of the pivot table
plt.pcolor(df_piv,cmap="RdBu")
plt.colorbar()
plt.show()
#Correlation
sns.regplot(x="engine-size",y="price",data=df)
plt.ylim(0,)
pearson_coef,p_value=stats.pearsonr(df["horsepower"],df["price"])
#ANOVA
df_anova=df[["make","price"]]
grouped_anova=df_anova.groupby(["make"])
anova_results_1=stats.f_oneway(grouped_anova.get_group("honda")["price"],grouped_anova.get_group("subaru")["price"])
#regression
from sklearn.linear_model import LinearRegression
lm=LinearRegression()
x=df[["highwaympg"]]
y=df[["price"]]
lm.fit(x,y) # simple linear regression SLR
z=df[["horsepower","curbweight","enginesize","highwaympg"]]
lm.fit(z,df["price"]) #MLR
yhat=lm.predict(z) #predict with the most recent (MLR) fit
lm.intercept_
lm.coef_
#residual plot...if residuals are randomly spread around the x axis, a linear model is appropriate
#if the plot shows curvature, a non-linear model is appropriate
#if it's unevenly spread around the x axis, the model is incorrect
sns.residplot(x=df["highway-mpg"],y=df["price"])
#distribution plot
ax1=sns.distplot(df["price"],hist=False,color="r",label="Actual Value")
sns.distplot(yhat,hist=False,color="b",label="Fitted Values",ax=ax1)
#polynomial regression of 3rd order
x=df["highway-mpg"] #polyfit needs 1-D arrays, not DataFrames
y=df["price"]
f=np.polyfit(x,y,3)
p=np.poly1d(f)
print(p)
#for polynomial features with more than one input variable
from sklearn.preprocessing import PolynomialFeatures
pr=PolynomialFeatures(degree=2, include_bias=False)
#Using a pipeline to perform 1.Normalization 2.Transformation 3.Prediction in one object simplifies the code
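#A minimal sketch of such a pipeline; the scaler choice, degree=2, and step names are illustrative assumptions, not fixed by these notes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
pipe = Pipeline([("scale",StandardScaler()),("polynomial",PolynomialFeatures(degree=2,include_bias=False)),("model",LinearRegression())])
pipe.fit(z,df["price"]) #normalizes, transforms, then fits in one call
yhat_pipe = pipe.predict(z) #the same preprocessing steps are applied before predicting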
#mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(df["price"],Y_predict_simple_fit)
#r squared value
x = df["highwaympg"]
y = df["price"]
lm.fit(x,y)
lm.score(x,y)
#if R squared is negative it could be due to overfitting
#training and test set split - model evaluation
from sklearn.model_selection import train_test_split
y_data = df["price"]
x_data = df.drop("price",axis=1) #features (independent variables)
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=.3,random_state=0)
#cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
scores=cross_val_score(lm,x_data,y_data,cv=3)
np.mean(scores)
yhat = cross_val_predict(lm,x_data,y_data,cv=3)
#ridge regression
from sklearn.linear_model import Ridge
RidgeModel = Ridge(alpha=.1)
RidgeModel.fit(z,df["price"])
yhat = RidgeModel.predict(z)
From: https://stackoverflow.com/questions/50302180/difference-between-dfx-dfx-dfx-dfx-and-df-x
df[x] — index a column using variable x. Returns pd.Series
df[[x]] — index/slice a single-column DataFrame using variable x. Returns pd.DataFrame
df['x'] — index a column named 'x'. Returns pd.Series
df[['x']] — index/slice a single-column DataFrame having only one column named 'x'. Returns pd.DataFrame
df.x — dot accessor notation, equivalent to df['x'] (there are, however, limitations on what x can be named if dot notation is to be successfully used). Returns pd.Series
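A quick sketch illustrating the four forms; it assumes df has a "price" column, as in the notes above:
col = "price"
df[col]        #pd.Series, via a variable holding the column name
df[[col]]      #pd.DataFrame with that single column
df["price"]    #pd.Series, via the literal column name
df[["price"]]  #pd.DataFrame with only the "price" column
df.price       #same pd.Series as df["price"]; works only when the name is a valid Python identifier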