Skip to content

Instantly share code, notes, and snippets.

View MariaLavrovskaya's full-sized avatar

Maria MariaLavrovskaya

  • London, United Kingdom
View GitHub Profile
# --- Data preparation ---
# Drop identifier, free-text and geographic columns that are not used as
# features, then discard every row that still contains a missing value.
columns_to_drop = ['id', 'name', 'host_id', 'host_name', 'neighbourhood',
                   'latitude', 'last_review', 'longitude', 'room_type']
data_1 = data.drop(columns_to_drop, axis=1)
data_1.dropna(how='any', inplace=True)
# Peek at the first rows of the cleaned frame.
data_1.head()
#Label encoding setup for the target labels
from sklearn.preprocessing import LabelEncoder
# NOTE(review): area_encoder is created here but never fitted/applied in this
# fragment — presumably fit_transform happens in a later gist cell; confirm.
area_encoder = LabelEncoder()
# Target vector: the borough group of each listing (still raw strings here).
data_y = data_1['neighbourhood_group']
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# NOTE(review): this gist's cells are out of order — X_2 is only built on the
# last line of this fragment, and y comes from another cell; those must run
# first. 75/25 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X_2, y,test_size = 0.25, random_state = 0)
# Create linear regression object
regr = linear_model.LinearRegression(fit_intercept=True) # Do not use fit_intercept = False if you have removed 1 column after dummy encoding
# Train the model using the training sets
regr.fit(X_train, Y_train)
# Predictions for the held-out test set.
y_pred = regr.predict(X_test)
#Checking between observed and predicted data
# Resulting feature matrix with all independent variables: the scaled
# continuous column(s) side by side with the one-hot dummy columns.
X_2 = np.concatenate((scaled_columns,X_train_ohe),axis=1)
# Standardize the continuous 'runtime' feature (zero mean, unit variance)
# with StandardScaler.
columns_to_scale = np.array(df_1['runtime'])
scaler = StandardScaler()
# StandardScaler expects a 2-D array, so reshape the 1-D column into a
# single-feature matrix before fitting.
scaled_columns = scaler.fit_transform(columns_to_scale.reshape(-1, 1))
# One-hot encode the label-encoded categorical columns into dummy variables.
from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so `OneHotEncoder(sparse=False)` crashes on current
# versions. Pick whichever keyword this installation supports; either way we
# want a dense ndarray back rather than a sparse matrix.
try:
    ohe = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
except TypeError:
    ohe = OneHotEncoder(sparse=False)         # scikit-learn < 1.2
# X_train_le is the label-encoded feature matrix built in another gist cell.
X_train_ohe = ohe.fit_transform(X_train_le)
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# Shared LabelEncoder instance for encoding categorical columns.
le = preprocessing.LabelEncoder()
# LabelEncoder for a number of columns
class MultiColumnLabelEncoder:
def __init__(self, columns = None):
#Splitting into 2 matrices: independent variables used for prediction and the dependent variable (what is predicted)
# NOTE(review): the gist's cells are out of order — df_1 is only created on
# the last statement of this fragment (the data.loc[...] selection); that
# cell must run first.
X = df_1.drop(['price', 'reviews_per_month'], axis = 1) #Feature Matrix
y = df_1["price"] #Dependent Variables
# Total count of missing cells, then the per-column breakdown.
print(df_1.isnull().values.sum())
print(df_1.isnull().sum())
#Dropping missing values from my dataset
df_1.dropna(how='any', inplace=True)
print(df_1.isnull().values.sum()) #checking for missing values after the dropna()
# Keep only the columns relevant for the price model.
df_1 = data.loc[:, ['neighbourhood_group', 'neighbourhood','room_type', 'price', 'minimum_nights',
'number_of_reviews', 'reviews_per_month']]
# Observed frequency matrix: the first 5 rows x 4 columns of the contingency
# table (area vs. room-type counts). A single slice replaces the original
# row-by-row np.array construction and yields an identical (5, 4) array.
f_obs = contingency_table.iloc[0:5, 0:4].to_numpy()
from scipy import stats
# Chi-square test of independence; the first three elements of the result are
# (test statistic, p-value, degrees of freedom).
stats.chi2_contingency(f_obs)[0:3]
### Since the p-value is below the significance level, we reject the null
### hypothesis and accept the alternative hypothesis, which states that there
### is a direct relationship between the location and the type of property
### listed on the AIRBNB site.