Skip to content

Instantly share code, notes, and snippets.

View MariaLavrovskaya's full-sized avatar

Maria MariaLavrovskaya

  • London, United Kingdom
View GitHub Profile
# --- Data preparation ---
# Drop identifier, free-text and geographic columns that are not used as
# features, then discard every row that still contains a missing value.
columns_to_drop = ['id', 'name', 'host_id', 'host_name', 'neighbourhood',
                   'latitude', 'last_review', 'longitude', 'room_type']
data_1 = data.drop(columns_to_drop, axis=1)
data_1.dropna(how='any', inplace=True)
# Peek at the first rows of the cleaned frame.
data_1.head()
#Label encoding setup for the target labels
from sklearn.preprocessing import LabelEncoder
# NOTE(review): area_encoder is created here but never fitted/applied in this
# fragment — presumably fit_transform happens in a later gist cell; confirm.
area_encoder = LabelEncoder()
# Target vector: the borough group of each listing (still raw strings here).
data_y = data_1['neighbourhood_group']
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# NOTE(review): this gist's cells are out of order — X_2 is only built on the
# last line of this fragment, and y comes from another cell; those must run
# first. 75/25 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X_2, y,test_size = 0.25, random_state = 0)
# Create linear regression object
regr = linear_model.LinearRegression(fit_intercept=True) # Do not use fit_intercept = False if you have removed 1 column after dummy encoding
# Train the model using the training sets
regr.fit(X_train, Y_train)
# Predictions for the held-out test set.
y_pred = regr.predict(X_test)
#Checking between observed and predicted data
# Resulting feature matrix with all independent variables: the scaled
# continuous column(s) side by side with the one-hot dummy columns.
X_2 = np.concatenate((scaled_columns,X_train_ohe),axis=1)
# Standardize the continuous 'runtime' feature (zero mean, unit variance)
# with StandardScaler.
columns_to_scale = np.array(df_1['runtime'])
scaler = StandardScaler()
# StandardScaler expects a 2-D array, so reshape the 1-D column into a
# single-feature matrix before fitting.
scaled_columns = scaler.fit_transform(columns_to_scale.reshape(-1, 1))
# One-hot encode the label-encoded categorical columns into dummy variables.
from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so `OneHotEncoder(sparse=False)` crashes on current
# versions. Pick whichever keyword this installation supports; either way we
# want a dense ndarray back rather than a sparse matrix.
try:
    ohe = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
except TypeError:
    ohe = OneHotEncoder(sparse=False)         # scikit-learn < 1.2
# X_train_le is the label-encoded feature matrix built in another gist cell.
X_train_ohe = ohe.fit_transform(X_train_le)
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# Shared LabelEncoder instance for encoding categorical columns.
le = preprocessing.LabelEncoder()
# LabelEncoder for a number of columns
class MultiColumnLabelEncoder:
def __init__(self, columns = None):
#Splitting into 2 matrices: independent variables used for prediction and the dependent variable (what is predicted)
# NOTE(review): the gist's cells are out of order — df_1 is only created on
# the last statement of this fragment (the data.loc[...] selection); that
# cell must run first.
X = df_1.drop(['price', 'reviews_per_month'], axis = 1) #Feature Matrix
y = df_1["price"] #Dependent Variables
# Total count of missing cells, then the per-column breakdown.
print(df_1.isnull().values.sum())
print(df_1.isnull().sum())
#Dropping missing values from my dataset
df_1.dropna(how='any', inplace=True)
print(df_1.isnull().values.sum()) #checking for missing values after the dropna()
# Keep only the columns relevant for the price model.
df_1 = data.loc[:, ['neighbourhood_group', 'neighbourhood','room_type', 'price', 'minimum_nights',
'number_of_reviews', 'reviews_per_month']]
# Observed frequency matrix: the first 5 rows x 4 columns of the contingency
# table (area vs. room-type counts). A single slice replaces the original
# row-by-row np.array construction and yields an identical (5, 4) array.
f_obs = contingency_table.iloc[0:5, 0:4].to_numpy()
from scipy import stats
# Chi-square test of independence; the first three elements of the result are
# (test statistic, p-value, degrees of freedom).
stats.chi2_contingency(f_obs)[0:3]
### Since the p-value is below the significance level, we reject the null
### hypothesis and accept the alternative hypothesis, which states that there
### is a direct relationship between the location and the type of property
### listed on the AIRBNB site.