Salil Athalye salilathalye

## gist:80bb089f40564cef9c74457e383e5e62
BIM Info =
VAR TableInfo =
    ADDCOLUMNS (
        INFO.VIEW.TABLES (),
        "Component", "Tables"
    )
VAR ColumnInfo =
    ADDCOLUMNS (
        INFO.VIEW.COLUMNS (),
        "Component", "Columns"

## impute_missing_in_df.py
def impute_missing(df):
  '''
  Impute categorical with mode
  Impute numeric with mean
  '''
  categorical_cols = df.select_dtypes(include=['object','category']).columns
  numeric_cols = df.select_dtypes(include=['number']).columns
  for cat_col in categorical_cols:
    df[cat_col] = df[cat_col].fillna(df[cat_col].value_counts()[0])
  for num_col in numeric_cols:

## trim_string_columns_in_dataframe.py
def trim_all_columns(df):
    """
    Strip leading/trailing whitespace from every string value in a dataframe.

    https://stackoverflow.com/questions/40950310/strip-trim-all-strings-of-a-dataframe

    Values that are not ``str`` instances are passed through unchanged.
    Returns the dataframe produced by the element-wise call; ``df`` itself
    is not assigned to.
    """
    def strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name emits a FutureWarning. Probe the class rather than the
    # instance so a column literally named "map" cannot satisfy the check
    # on older pandas versions that lack the method.
    if hasattr(type(df), "map"):
        return df.map(strip_if_str)
    return df.applymap(strip_if_str)

## pathlib_cookiecutter_paths.py
from pathlib import Path
# Uses cookiecutter datascience template
# This jupyter notebook is in the notebooks directory

# Absolute path of the directory the notebook is running from.
notebook_path = Path().resolve()
# The project root sits one level above the notebooks directory.
project_path = notebook_path.parents[0]
# Data folders under <project>/data.
data_raw_path = project_path.joinpath('data', 'raw')
data_interim_path = project_path.joinpath('data', 'interim')
data_processed_path = project_path.joinpath('data', 'processed')

## pandas_profiling_eda_jupyter.py
from pandas_profiling import ProfileReport
# Build a ProfileReport for `training_data`, which must already exist in the
# notebook namespace; the explorative option is switched on.
profile = ProfileReport(training_data, title='Pandas Profiling Report', explorative=True)
# Write the report to an HTML file (relative path, so it lands in the
# current working directory).
profile.to_file("training_data_profile.html")
# Presumably embeds the same report inline in the notebook output — confirm
# against the pandas_profiling docs.
profile.to_notebook_iframe()

## categorical_summary.py
def categorical_summary(df):
  '''
  Adapted from https://www.kaggle.com/nextbigwhat/eda-for-categorical-variables-part-2

  Returns a dataframe containing information about categorical columns
  Column name is set as the index
  '''
  categorical_cols = df.select_dtypes(include='object').columns
  summary_df = pd.DataFrame(columns=
      [

## seaborn_correlation_heatmap.py
def plot_correlaton_heatmap(df):
  '''
  Draw an annotated correlation heatmap for the non-object columns of df.

  Every column whose dtype is not 'object' is selected, the pairwise
  correlation matrix is computed with DataFrame.corr(), and the matrix is
  rendered with sns.heatmap on a 10x8 figure using the 'RdBu_r' colormap.

  Expects `plt` and `sns` to be available in the enclosing namespace
  (presumably matplotlib.pyplot and seaborn - confirm against the
  notebook's imports). Returns None.
  '''
  numeric_cols = df.select_dtypes(exclude='object').columns
  plt.figure(figsize=(10, 8))
  sns.heatmap(df[numeric_cols].corr(), cmap='RdBu_r', annot=True)
  # plt.show() returns None, so wrapping it in print() only added a stray
  # "None" line to the output; call it directly instead.
  plt.show()

## dtale_colab
import dtale
import dtale.app as dtale_app

# Turn on the module-level USE_NGROK flag in dtale.app before launching.
# Presumably this exposes the D-Tale server through an ngrok tunnel so it is
# reachable from a hosted notebook such as Colab — confirm against dtale docs.
dtale_app.USE_NGROK = True

# Launch D-Tale on `training_data`, which must already exist in the notebook.
# NOTE(review): ignore_duplicate=True presumably skips dtale's check for data
# that is already loaded — confirm.
dtale.show(training_data, ignore_duplicate=True)

## cookiecutter-datascience-template
# Run cookiecutter against the drivendata cookiecutter-data-science template repository.
cookiecutter https://github.com/drivendata/cookiecutter-data-science

## git_init_main_branch
# Set init.defaultBranch to "main" in the global (per-user) git configuration.
git config --global init.defaultBranch main
	BIM Info =
	VAR TableInfo =
	ADDCOLUMNS (
	INFO.VIEW.TABLES (),
	"Component", "Tables"
	)
	VAR ColumnInfo =
	ADDCOLUMNS (
	INFO.VIEW.COLUMNS (),
	"Component", "Columns"
	def impute_missing(df):
	'''
	Impute categorical with mode
	Impute numeric with mean
	'''
	categorical_cols = df.select_dtypes(include=['object','category']).columns
	numeric_cols = df.select_dtypes(include=['number']).columns
	for cat_col in categorical_cols:
	df[cat_col] = df[cat_col].fillna(df[cat_col].value_counts()[0])
	for num_col in numeric_cols:
	def trim_all_columns(df):
	    """
	    Strip leading/trailing whitespace from every string value in a dataframe.

	    https://stackoverflow.com/questions/40950310/strip-trim-all-strings-of-a-dataframe

	    Values that are not ``str`` instances are passed through unchanged.
	    Returns the dataframe produced by the element-wise call; ``df`` itself
	    is not assigned to.
	    """
	    def strip_if_str(value):
	        return value.strip() if isinstance(value, str) else value

	    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
	    # old name emits a FutureWarning. Probe the class rather than the
	    # instance so a column literally named "map" cannot satisfy the check
	    # on older pandas versions that lack the method.
	    if hasattr(type(df), "map"):
	        return df.map(strip_if_str)
	    return df.applymap(strip_if_str)
	from pathlib import Path
	# Uses cookiecutter datascience template
	# This jupyter notebook is in the notebooks directory

	# Absolute path of the directory the notebook is running from.
	notebook_path = Path().resolve()
	# The project root sits one level above the notebooks directory.
	project_path = notebook_path.parents[0]
	# Data folders under <project>/data.
	data_raw_path = project_path.joinpath('data', 'raw')
	data_interim_path = project_path.joinpath('data', 'interim')
	data_processed_path = project_path.joinpath('data', 'processed')
	from pandas_profiling import ProfileReport
	# Build a ProfileReport for `training_data`, which must already exist in the
	# notebook namespace; the explorative option is switched on.
	profile = ProfileReport(training_data, title='Pandas Profiling Report', explorative=True)
	# Write the report to an HTML file (relative path, so it lands in the
	# current working directory).
	profile.to_file("training_data_profile.html")
	# Presumably embeds the same report inline in the notebook output — confirm
	# against the pandas_profiling docs.
	profile.to_notebook_iframe()
	def categorical_summary(df):
	'''
	Adapted from https://www.kaggle.com/nextbigwhat/eda-for-categorical-variables-part-2

	Returns a dataframe containing information about categorical columns
	Column name is set as the index
	'''
	categorical_cols = df.select_dtypes(include='object').columns
	summary_df = pd.DataFrame(columns=
	[
	def plot_correlaton_heatmap(df):
	  '''
	  Draw an annotated correlation heatmap for the non-object columns of df.

	  Every column whose dtype is not 'object' is selected, the pairwise
	  correlation matrix is computed with DataFrame.corr(), and the matrix is
	  rendered with sns.heatmap on a 10x8 figure using the 'RdBu_r' colormap.

	  Expects `plt` and `sns` to be available in the enclosing namespace
	  (presumably matplotlib.pyplot and seaborn - confirm against the
	  notebook's imports). Returns None.
	  '''
	  numeric_cols = df.select_dtypes(exclude='object').columns
	  plt.figure(figsize=(10, 8))
	  sns.heatmap(df[numeric_cols].corr(), cmap='RdBu_r', annot=True)
	  # plt.show() returns None, so wrapping it in print() only added a stray
	  # "None" line to the output; call it directly instead.
	  plt.show()
	import dtale
	import dtale.app as dtale_app

	# Turn on the module-level USE_NGROK flag in dtale.app before launching.
	# Presumably this exposes the D-Tale server through an ngrok tunnel so it is
	# reachable from a hosted notebook such as Colab — confirm against dtale docs.
	dtale_app.USE_NGROK = True

	# Launch D-Tale on `training_data`, which must already exist in the notebook.
	# NOTE(review): ignore_duplicate=True presumably skips dtale's check for data
	# that is already loaded — confirm.
	dtale.show(training_data, ignore_duplicate=True)