# Step 1
import math
import numpy as np

# The dependent variable was log-transformed, so invert with exp before computing the metric.
def inv_y(a): return np.exp(a)

# Root mean squared percentage error (RMSPE), evaluated on the original sales scale.
def exp_rmspe(y_pred, targ):
    targ = inv_y(targ)
    pct_var = (targ - inv_y(y_pred))/targ
    return math.sqrt((pct_var**2).mean())

max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)
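# As a quick sanity check, the metric can be exercised on hand-made log-space arrays.
# The values below are illustrative only (not taken from the dataset); exp_rmspe
# converts back to the sales scale internally via inv_y.
y_true_log = np.log(np.array([5263., 6064., 8314.]))
y_pred_log = np.log(np.array([5000., 6200., 8000.]))
print(exp_rmspe(y_pred_log, y_true_log))   # a small value, since the predictions are close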
# Step 13
# proc_df pulls out the dependent variable ('Sales'), fills missing values (returned in nas),
# and scales the continuous fields (via mapper). The same nas/mapper are passed back in for
# the test set so it receives identical preprocessing.
df, y, nas, mapper = proc_df(joined_samp, 'Sales', do_scale=True)
yl = np.log(y)   # model the log of sales, consistent with the RMSPE metric in Step 1

joined_test = joined_test.set_index("Date")
df_test, _, nas, mapper = proc_df(joined_test, 'Sales', do_scale=True, skip_flds=['Id'],
                                  mapper=mapper, na_dict=nas)
# Step 12
joined = pd.read_feather(f'{PATH_WRITE}joined')
joined_test = pd.read_feather(f'{PATH_WRITE}joined_test')
joined.head().T.head(40)

# Defining the categorical and continuous variables
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
# Step 11
# It is common when working with time series data to extract data that explains relationships across rows
# as opposed to columns, e.g.:
# * Running averages
# * Time until next event
# * Time since last event
# This is often difficult to do with most table manipulation frameworks, since they are designed to work
# with relationships across columns. As such, we've created a class to handle this type of data.
# We'll define a function `get_elapsed` for cumulative counting across a sorted dataframe.
# Given a particular field `fld` to monitor, this function will start tracking time since the last
# occurrence of that field. When the field is seen again, the counter is set to zero.
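# A minimal sketch of such a `get_elapsed`-style helper, assuming a dataframe sorted by Store and Date
# (column names as in the Rossmann tables above). Treat this as illustrative rather than the notebook's
# exact implementation.
def get_elapsed(df, fld, pre):
    day1 = np.timedelta64(1, 'D')
    last_date = np.datetime64()      # NaT until the monitored field is first seen
    last_store = 0
    res = []
    for s, v, d in zip(df.Store.values, df[fld].values, df.Date.values):
        if s != last_store:          # new store: reset the tracker
            last_date = np.datetime64()
            last_store = s
        if v:                        # field occurs: the counter restarts from zero here
            last_date = d
        res.append((d - last_date).astype('timedelta64[D]') / day1)
    df[pre + fld] = res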
# Step 10
from isoweek import Week   # Week(year, week).monday() gives the Monday of that ISO week

for df in (joined, joined_test):
    # Promo2Since: the calendar date of the Monday of the year/week in which Promo2 started.
    df["Promo2Since"] = pd.to_datetime(
        df.apply(lambda x: Week(x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1))
    df["Promo2Days"] = df.Date.subtract(df["Promo2Since"]).dt.days

# The commented-out example below shows how the Promo2Since line works:
# w = Week(2011, 20)
# print("Week %s starts on %s" % (w, w.monday()))
# Step 9
# We'll replace some erroneous / outlying data.
for df in (joined, joined_test):
    df.loc[df.CompetitionDaysOpen < 0, "CompetitionDaysOpen"] = 0
    df.loc[df.CompetitionOpenSinceYear < 1990, "CompetitionDaysOpen"] = 0

# We add a "CompetitionMonthsOpen" field, capping it at 2 years to limit the number of unique categories.
# This will help later on during feature engineering.
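# The derivation of CompetitionMonthsOpen isn't shown in this snippet; a minimal sketch consistent with
# the comment above (days divided by 30, capped at 24 months) would be the following, though the
# notebook's exact code may differ slightly.
for df in (joined, joined_test):
    df["CompetitionMonthsOpen"] = df["CompetitionDaysOpen"] // 30
    df.loc[df.CompetitionMonthsOpen > 24, "CompetitionMonthsOpen"] = 24   # cap at 2 years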
# Step 8
# Next we'll fill in missing values to avoid complications with NAs.
# NA (not available) is how pandas indicates missing values; many models have problems when missing
# values are present, so it's always important to think about how to deal with them.
# In these cases, we are picking an arbitrary signal value that doesn't otherwise appear in the data.
for df in (joined, joined_test):
    df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32)
    df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32)
    df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32)
# Step 7
# The Google trends data has a special category for the whole of Germany - we'll pull that out so we can
# use it explicitly.
trend_de = googletrend[googletrend.file == 'Rossmann_DE']

# Now we can outer join all of our data into a single dataframe.
# Recall that in outer joins, every time a value in the joining field on the left table does not have a
# corresponding value in the right table, the corresponding row in the new table has null values for all
# right table fields.
# One way to check that all records are consistent and complete is to check for null values post-join,
# as we do here.
# Aside: why not just do an inner join? If you are assuming that all records are complete and match on
# the field you desire, an inner join will do the same thing as an outer join; but if that assumption is
# wrong, an outer join followed by a null check will catch the problem.
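# The joining code itself is cut off in this snippet. A hedged sketch of the pattern described above
# (a left outer join followed by a null check); `join_df` is an assumed helper here, not necessarily
# the notebook's exact definition.
def join_df(left, right, left_on, right_on=None, suffix='_y'):
    if right_on is None: right_on = left_on
    # Left outer join; duplicate right-table column names get a suffix.
    return left.merge(right, how='left', left_on=left_on, right_on=right_on,
                      suffixes=("", suffix))

joined = join_df(train, store, "Store")
print(len(joined[joined.StoreType.isnull()]))   # 0 means every row found a match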
# Step 6
# The following extracts particular date fields from a complete datetime for the purpose of constructing
# categoricals.
# You should *always* consider this feature extraction step when working with date-times.
# Without expanding your date-time into these additional fields, you can't capture any trend/cyclical
# behavior as a function of time at any of these granularities.
# We'll add this to every table with a date field.
add_datepart(weather, "Date", drop=False)
add_datepart(googletrend, "Date", drop=False)
add_datepart(train, "Date", drop=False)
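# add_datepart comes from the fastai 0.7 structured-data utilities. As a rough sketch of what such an
# expansion does in plain pandas (fastai adds a longer list of fields, so this is illustrative only):
def add_datepart_sketch(df, fldname, drop=False):
    fld = pd.to_datetime(df[fldname])
    # Expand one datetime column into several categorical-friendly parts.
    for part in ('year', 'month', 'day', 'dayofweek', 'dayofyear'):
        df[fldname + part.capitalize()] = getattr(fld.dt, part)
    df[fldname + 'Elapsed'] = fld.astype('int64') // 10**9   # seconds since the epoch
    if drop: df.drop(fldname, axis=1, inplace=True)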
# Step 5
# In pandas you can add a new column to a dataframe simply by defining it.
# We'll do this for googletrend by extracting dates and state names from the given data and adding
# those columns.
# We're also going to replace all instances of state name 'NI' with 'HB,NI' to match the usage in the
# rest of the data.
# This is a good opportunity to highlight pandas indexing. We can use `.loc[rows, cols]` to select a
# list of rows and a list of columns from the dataframe.
# In this case, we're selecting rows with state name 'NI' via the boolean mask `googletrend.State=='NI'`
# and selecting the "State" column.
googletrend['Date'] = googletrend.week.str.split(' - ', expand=True)[0]
googletrend['State'] = googletrend.file.str.split('_', expand=True)[2]
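# The replacement line itself is cut off in this snippet; based on the description above it would look
# like the following (assumed, since the original line isn't shown):
googletrend.loc[googletrend.State == 'NI', "State"] = 'HB,NI'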