CaptainAshis/dt_feature3.py

## dt_feature3.py
# Step 7
#  The Google trends data has a special category for the whole of the Germany - we'll pull that out so we can use it explicitly.
trend_de = googletrend[googletrend.file == 'Rossmann_DE']

# Now we can outer join all of our data into a single dataframe.
# Recall that in outer joins everytime a value in the joining field on the left table does not have a corresponding value
# on the right table, the corresponding row in the new table has Null values for all right table fields.
# One way to check that all records are consistent and complete is to check for Null values post-join, as we do here.

# Aside: Why note just do an inner join? If you are assuming that all records are complete and match on the field you desire,
# an inner join will do the same thing as an outer join. However, in the event you are wrong or a mistake is made,
# an outer join followed by a null-check will catch it.
# (Comparing before/after # of rows for inner join is equivalent, but requires keeping track of before/after row #'s.
# Outer join is easier.)


store = join_df(store, store_states, "Store")
len(store[store.State.isnull()])

# 0 -No null values present

# Joining of all the dataframes
joined = join_df(train, store, "Store")
joined_test = join_df(test, store, "Store")
len(joined[joined.StoreType.isnull()]),len(joined_test[joined_test.StoreType.isnull()])


joined = join_df(joined, googletrend, ["State","Year", "Week"])
joined_test = join_df(joined_test, googletrend, ["State","Year", "Week"])
len(joined[joined.trend.isnull()]),len(joined_test[joined_test.trend.isnull()])


joined = joined.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
joined_test = joined_test.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])


joined = join_df(joined, weather, ["State","Date"])
joined_test = join_df(joined_test, weather, ["State","Date"])
len(joined[joined.Mean_TemperatureC.isnull()]),len(joined_test[joined_test.Mean_TemperatureC.isnull()])


# Dropping the extra columns
for df in (joined, joined_test):
    for c in df.columns:
        if c.endswith('_y'):
            if c in df.columns: df.drop(c, inplace=True, axis=1)
	# Step 7
	# The Google trends data has a special category for the whole of the Germany - we'll pull that out so we can use it explicitly.
	trend_de = googletrend[googletrend.file == 'Rossmann_DE']

	# Now we can outer join all of our data into a single dataframe.
	# Recall that in outer joins everytime a value in the joining field on the left table does not have a corresponding value
	# on the right table, the corresponding row in the new table has Null values for all right table fields.
	# One way to check that all records are consistent and complete is to check for Null values post-join, as we do here.

	# Aside: Why note just do an inner join? If you are assuming that all records are complete and match on the field you desire,
	# an inner join will do the same thing as an outer join. However, in the event you are wrong or a mistake is made,
	# an outer join followed by a null-check will catch it.
	# (Comparing before/after # of rows for inner join is equivalent, but requires keeping track of before/after row #'s.
	# Outer join is easier.)


	store = join_df(store, store_states, "Store")
	len(store[store.State.isnull()])

	# 0 -No null values present

	# Joining of all the dataframes
	joined = join_df(train, store, "Store")
	joined_test = join_df(test, store, "Store")
	len(joined[joined.StoreType.isnull()]),len(joined_test[joined_test.StoreType.isnull()])


	joined = join_df(joined, googletrend, ["State","Year", "Week"])
	joined_test = join_df(joined_test, googletrend, ["State","Year", "Week"])
	len(joined[joined.trend.isnull()]),len(joined_test[joined_test.trend.isnull()])


	joined = joined.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
	joined_test = joined_test.merge(trend_de, 'left', ["Year", "Week"], suffixes=('', '_DE'))
	len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])


	joined = join_df(joined, weather, ["State","Date"])
	joined_test = join_df(joined_test, weather, ["State","Date"])
	len(joined[joined.Mean_TemperatureC.isnull()]),len(joined_test[joined_test.Mean_TemperatureC.isnull()])


	# Dropping the extra columns
	for df in (joined, joined_test):
	for c in df.columns:
	if c.endswith('_y'):
	if c in df.columns: df.drop(c, inplace=True, axis=1)