Created
May 30, 2019 14:58
-
-
Save abdel1979/3980860e29ed1c86dde8aa1a79c954d2 to your computer and use it in GitHub Desktop.
Kaggle competition script; my profile: https://www.kaggle.com/ouassini
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from sklearn.compose import ColumnTransformer | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.metrics import mean_absolute_error | |
from xgboost import XGBRegressor | |
# Load the labelled training rows and the unlabelled test rows; both frames
# use the 'Id' column as their index.
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Rows without a 'SalePrice' value cannot be used for fitting, so discard them.
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Keep the target as its own Series and remove it from the predictor frame.
y = X_full['SalePrice']
X_full.drop(columns=['SalePrice'], inplace=True)

# Hold out 20% of the rows for validation; random_state=0 fixes the shuffle
# so the split is the same on every run.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
# Hard-coded alternative, kept for reference:
# cols_with_missing = ['Alley','PoolQC','MiscFeature','Fence','FireplaceQu','LotFrontage']

# Count the missing entries in every column of the training split; a column
# with a non-zero count is scheduled for removal.
missing_val_count_by_column = X_train_full.isnull().sum()
cols_with_missing = missing_val_count_by_column[
    missing_val_count_by_column > 0
].index.tolist()

# Diagnostics: shape of the training split and the per-column missing counts
# before anything is dropped.
print(X_train_full.shape)
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Remove the same columns from the training, validation and test frames so
# all three keep an identical set of columns.
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)
X_test_full = X_test_full.drop(columns=cols_with_missing)

# Recount after the drop and print whatever training columns still contain
# missing values.
missing_val_count_by_column = X_train_full.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])
# "Cardinality" means the number of unique values in a column | |
# Sort the remaining training columns into two groups in a single pass:
#   * categorical: object-dtype columns with fewer than 10 distinct values
#     (a convenient but arbitrary cut-off),
#   * numerical: int64 or float64 columns.
# Columns that fit neither group are left out of the model input.
categorical_cols = []
numerical_cols = []
for cname, col_dtype in X_train_full.dtypes.items():
    if col_dtype == "object":
        if X_train_full[cname].nunique() < 10:
            categorical_cols.append(cname)
    elif col_dtype in ('int64', 'float64'):
        numerical_cols.append(cname)

# Restrict every frame to the selected columns (categorical first, then
# numerical) and work on copies so the *_full frames stay untouched.
my_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()
# Numerical columns: replace missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with a constant placeholder, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fitting from raising an error at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted regressor: 2000 trees, learning rate 0.02, 4 parallel jobs.
model = XGBRegressor(n_estimators=2000, learning_rate=0.02, n_jobs=4)

# Chain preprocessing and the model so both are fitted and applied together.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
MyPip = Pipeline(steps=pipeline_steps)
# | |
# Fit the preprocessing steps and the model on the training split.
MyPip.fit(X_train, y_train)

# Predict on the held-out validation split and report the mean absolute error.
preds = MyPip.predict(X_valid)
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

# To write a submission file instead, predict on the test set and save the
# result (left disabled; the MAE above must be computed from the validation
# predictions, so do not overwrite `preds` before that line runs):
# preds = MyPip.predict(X_test)
# output = pd.DataFrame({'Id': X_test.index,
#                        'SalePrice': preds})
# output.to_csv('submission.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment