Video link: https://www.youtube.com/watch?v=qMEtqJPhqpA (Julien Simon)
-----
import pandas as pd
data = pd.read_csv('Your CSV File')
pd.set_option('display.max_columns', 500) # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50) # Keep the output on one page
data[:10]
----
Data - https://www.kaggle.com/c/demand-forecasting-kernels-only/data
DEMAND FORECASTING CHALLENGE
----
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Read the train data
train = pd.read_csv('train.csv')
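A minimal sketch of how this could continue, assuming the Kaggle train.csv columns (date, store, item, sales); the date features below are illustrative, not a reference solution:
# Derive simple date features
train['date'] = pd.to_datetime(train['date'])
train['month'] = train['date'].dt.month
train['dayofweek'] = train['date'].dt.dayofweek

# Fit a random forest on the engineered features
X = train[['store', 'item', 'month', 'dayofweek']]
y = train['sales']
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(X, y)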
----
Feature Engineering for Machine Learning in Python - DataCamp
----
Selecting specific data types
----
# Create subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in the numeric subset
print(so_numeric_df.columns)
---
One-hot encoding and dummy variables
---
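This heading has no code attached; a minimal sketch of both encodings with pd.get_dummies, assuming a DataFrame so_survey_df with a categorical Country column (names as used in the DataCamp exercises):
import pandas as pd

# One-hot encoding: one indicator column per category
one_hot_df = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')

# Dummy encoding: drop the first category to avoid a redundant column
dummy_df = pd.get_dummies(so_survey_df, columns=['Country'], prefix='DM', drop_first=True)
---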
Missing data - rows
---
# Check how many values are missing in the category_desc column
print(volunteer["category_desc"].isnull().sum())

# Subset the volunteer dataset to rows where category_desc is not null
volunteer_subset = volunteer[volunteer["category_desc"].notnull()]

# Print out the shape of the subset
print(volunteer_subset.shape)
----
Data link: https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease
----
# Import necessary modules
# (CategoricalImputer ships with older sklearn_pandas releases; it was removed in 2.x)
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)
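A minimal sketch of how these two imports are typically combined, assuming X is a pandas DataFrame; the variable names below are illustrative:
# Boolean mask of the object-typed (categorical) columns
categorical_feature_mask = X.dtypes == object
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Impute the most frequent category in each categorical column
categorical_imputation_mapper = DataFrameMapper(
    [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
    input_df=True,
    df_out=True
)
X_imputed = categorical_imputation_mapper.fit_transform(X)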
----
Starting
----
# Import xgboost and the scikit-learn helpers used below
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]

# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Compute the array containing the 10-fold CV MSEs
# (dt is assumed to be a DecisionTreeRegressor; any scikit-learn regressor works)
dt = DecisionTreeRegressor(random_state=123)
MSE_CV_scores = -cross_val_score(dt, X_train, y_train, cv=10,
                                 scoring='neg_mean_squared_error',
                                 n_jobs=-1)

# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)

# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
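xgboost is imported above but never used; a minimal sketch of the equivalent cross-validation through XGBoost's own API, with illustrative parameter values:
# Wrap the data in a DMatrix, XGBoost's internal data structure
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# 10-fold CV reporting RMSE (parameters are illustrative, not tuned)
params = {"objective": "reg:squarederror", "max_depth": 4}
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=10,
                    num_boost_round=10, metrics="rmse",
                    as_pandas=True, seed=123)
print(cv_results["test-rmse-mean"].tail(1))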
----
# RecordIO helpers for SageMaker (smac = sagemaker.amazon.common)
import sagemaker.amazon.common as smac

def write_recordio_file(filename, x, y=None):
    with open(filename, 'wb') as f:
        smac.write_numpy_to_dense_tensor(f, x, y)

def read_recordio_file(filename, recordsToPrint=10):
    with open(filename, 'rb') as f:
        records = smac.read_records(f)
        for i, r in enumerate(records):
            if i >= recordsToPrint:
                break
            print(r)  # print each decoded record (assumed intent of recordsToPrint)
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3, and a file name is a key name.
# Files stored in S3 are automatically replicated across three availability zones
# in the region where the bucket was created.
# http://boto3.readthedocs.io/en/latest/guide/s3.html
import boto3

def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:  # Read in binary mode
        boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)
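A short usage sketch tying the two helpers together, assuming X_train and y_train are numpy arrays; the bucket and key names are placeholders:
write_recordio_file('train.recordio', X_train, y_train)
write_to_s3('train.recordio', 'my-sagemaker-bucket', 'data/train.recordio')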
----
# Move the label to the first column: SageMaker's built-in algorithms
# expect the target in column 0 of CSV training data
cols = list(train_df)                                 # list of column names
cols.insert(0, cols.pop(cols.index('OUTPUT_LABEL')))  # move OUTPUT_LABEL to the front
train_df = train_df.loc[:, cols]                      # reorder the DataFrame
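A plausible next step, assuming this DataFrame feeds a SageMaker built-in algorithm (CSV training data takes no header row):
train_df.to_csv('train.csv', header=False, index=False)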