Video link: https://www.youtube.com/watch?v=qMEtqJPhqpA (Julien Simon)
-----
import pandas as pd
data = pd.read_csv('Your CSV File')
pd.set_option('display.max_columns', 500) # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50) # Keep the output on one page
data[:10]
----
Data - https://www.kaggle.com/c/demand-forecasting-kernels-only/data
DEMAND FORECASTING CHALLENGE
----
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Read the train data
train = pd.read_csv('train.csv')
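A minimal sketch of how this could continue, assuming the Kaggle train.csv columns (date, store, item, sales); the date features below are illustrative, not a reference solution:
# Derive simple date features
train['date'] = pd.to_datetime(train['date'])
train['month'] = train['date'].dt.month
train['dayofweek'] = train['date'].dt.dayofweek

# Fit a random forest on the engineered features
X = train[['store', 'item', 'month', 'dayofweek']]
y = train['sales']
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X, y)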
----
Feature Engineering for Machine Learning in Python - DataCamp
----
Selecting specific data types
----
# Create subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in the numeric subset
print(so_numeric_df.columns)
---
One-hot encoding and dummy variables
---
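This heading has no code attached; a minimal sketch of both encodings with pd.get_dummies, assuming a DataFrame so_survey_df with a categorical Country column (names as used in the DataCamp exercises):
import pandas as pd

# One-hot encoding: one indicator column per category
one_hot_df = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')

# Dummy encoding: drop the first category to avoid a redundant column
dummy_df = pd.get_dummies(so_survey_df, columns=['Country'], prefix='DM', drop_first=True)
---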
Missing data - rows
---
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())

# Subset the volunteer dataset to rows where category_desc is not null
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)
----
Data link: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
----
# Import necessary modules
# (CategoricalImputer ships with older sklearn_pandas releases; it was removed in 2.x)
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)
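A minimal sketch of how these two imports are typically combined, assuming X is a pandas DataFrame; the variable names below are illustrative:
# Boolean mask of the object-typed (categorical) columns
categorical_feature_mask = X.dtypes == object
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Impute the most frequent category in each categorical column
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)
X_imputed = categorical_imputation_mapper.fit_transform(X)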
----
Starting
----
# Import xgboost and the scikit-learn helpers used below
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Compute the array containing the 10-fold CV MSEs
# (dt is assumed to be a DecisionTreeRegressor; any scikit-learn regressor works)
dt = DecisionTreeRegressor(random_state=123)
MSE_CV_scores = -cross_val_score(dt, X_train, y_train, cv=10,
                                 scoring='neg_mean_squared_error',
                                 n_jobs=-1)

# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)

# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
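xgboost is imported above but never used; a minimal sketch of the equivalent cross-validation through XGBoost's own API, with illustrative parameter values:
# Wrap the data in a DMatrix, XGBoost's internal data structure
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# 10-fold CV reporting RMSE (parameters are illustrative, not tuned)
params = {"objective": "reg:squarederror", "max_depth": 4}
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=10,
                    num_boost_round=10, metrics="rmse",
                    as_pandas=True, seed=123)
print(cv_results["test-rmse-mean"].tail(1))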
----
# RecordIO helpers for SageMaker (smac = sagemaker.amazon.common)
import sagemaker.amazon.common as smac

def write_recordio_file(filename, x, y=None):
    with open(filename, 'wb') as f:
        smac.write_numpy_to_dense_tensor(f, x, y)

def read_recordio_file(filename, recordsToPrint=10):
    with open(filename, 'rb') as f:
        records = smac.read_records(f)
        for i, r in enumerate(records):
            if i >= recordsToPrint:
                break
            print(r)  # print each decoded record (assumed intent of recordsToPrint)
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3, and a file name is a key name.
# Files stored in S3 are automatically replicated across three availability zones
# in the region where the bucket was created.
# http://boto3.readthedocs.io/en/latest/guide/s3.html
import boto3

def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:  # Read in binary mode
        boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)
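A short usage sketch tying the two helpers together, assuming X_train and y_train are numpy arrays; the bucket and key names are placeholders:
write_recordio_file('train.recordio', X_train, y_train)
write_to_s3('train.recordio', 'my-sagemaker-bucket', 'data/train.recordio')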
----
# Move the label to the first column: SageMaker's built-in algorithms
# expect the target in column 0 of CSV training data
cols = list(train_df)                                 # list of column names
cols.insert(0, cols.pop(cols.index('OUTPUT_LABEL')))  # move OUTPUT_LABEL to the front
train_df = train_df.loc[:, cols]                      # reorder the DataFrame
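A plausible next step, assuming this DataFrame feeds a SageMaker built-in algorithm (CSV training data takes no header row):
train_df.to_csv('train.csv', header=False, index=False)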