acbart/common_jupyter_header.py

## common_jupyter_header.py
## Generally useful built libraries
import sys
import os
import json
import math
import re
import itertools
from collections import defaultdict
from dataclasses import dataclass
from datetime import timedelta, datetime

## Core Data Science Libraries

# Main matplotlib usage is with `plt.whatever`
import matplotlib.pyplot as plt
# Sometimes you need style stuff via `mpl.whatever`
import matplotlib as mpl
# Statistic functions like `st.f_oneway`, `st.pearsonr`, etc.
import scipy.stats as st
# Your entire life is now pd.DataFrame(), pd.concat, etc.
import pandas as pd
# Real plotters use Seaborn over Matplotlib: sns.histplot, sns.jointplot, sns.lmplot, etc.
import seaborn as sns
# Sometimes you have to use numpy stuff, but almost always better to stay in Pandas
import numpy as np

## Quality of Life Libraries

# Progress bars! Just do:
#   for an_item in tqdm(an_iterable):
from tqdm.notebook import tqdm

# Need to render HTML or Code? Combine them with `display`
# Sometimes you will want to `print`, and sometimes `display`
from IPython.display import display, HTML, Code

## More Situational Libraries

# Situational, but sometimes natsorting is helpful
from natsort import index_natsorted, natsorted, order_by_index, natsort_keygen

# Some more useful statistic related functions in here
import sklearn.metrics as metrics

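# More stats stuff, mixed effects models, generalized linear models, ODEs, etc.
# NOTE(review): the conventional alias is `import statsmodels.api as sm`; the bare
# package import below may not expose `sm.OLS` and friends -- confirm before relying on it.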
import statsmodels as sm

# Probably won't need to use hidden markov models
#from hmmlearn import hmm


### Style Stuff
# Recommend putting this in a separate cell

# At the minimum, make the figure background white so plots can be copy/pasted
# into Discord (or any other dark-themed app) and stay readable :)
mpl.rcParams['figure.facecolor'] = 'white'

# Widths in inches: 240 and 504 divided by 72.27, the same points-per-inch
# factor that `set_size` uses.
# NOTE(review): presumably the column and text widths (in pt) of the target
# LaTeX template -- confirm against the document class actually used.
COLUMN_WIDTH = 240/72.27
TEXT_WIDTH = 504/72.27

# I liked this style, but use your best judgement.
# The font settings further down are applied after this style sheet is loaded.
plt.style.use('seaborn-v0_8-whitegrid')

# Setup nice fonts for an ACM LaTeX paper
nice_fonts = {
    # Ideally, you should enable this if you have LaTeX installed
    # It'll make the graphs match the paper font much more closely.
    # "text.usetex": True,
    "text.usetex": False,

    "font.family": "serif",
    # Use 10pt font in plots, to match 10pt font in document
    "axes.labelsize": 10,
    "font.size": 10,
    # Make the legend/tick-label fonts a little smaller
    "legend.fontsize": 8,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
}
mpl.rcParams.update(nice_fonts)

# Render floats in pandas output with two fixed decimal places
# (which also stops pandas from switching to scientific notation).
pd.set_option('display.float_format', '{:.2f}'.format)

### Helper Functions

def save_figure(filename, fig=None, folder='reports/figures/'):
    """ Helper function to quickly save figures for the paper.

    Parameters
    ----------
    filename: str
            Name of the output file; it is joined onto `folder`.
    fig: object with a `savefig` method, optional
            What to save. When omitted, `plt.savefig` is called instead.
    folder: str, optional
            Directory the file is written into. It is created (including
            parents) when it does not exist yet.

    The file is always written with `format='pdf'` and a tight bounding box,
    whatever extension `filename` carries.
    """
    if fig is None:
        fig = plt
    # A fresh checkout usually has no `reports/figures/` yet; create it so the
    # save does not fail on a missing directory. An empty folder means the
    # current directory, which needs no creation.
    if folder:
        os.makedirs(folder, exist_ok=True)
    fig.savefig(os.path.join(folder, filename), format='pdf', bbox_inches='tight')

def display_code(data):
    """ Wrap `data` in an IPython `Code` object and pass it to `display`. """
    code_block = Code(data)
    return display(code_block)

# Quick smoke test: show a one-line snippet.
display_code("a = 0")

def set_size(width, fraction=1, subplots=(1, 1)):
    """ Compute figure dimensions in inches so LaTeX does not have to rescale.

    Parameters
    ----------
    width: float or string
            Document width in points, or the name of a predefined document
            type ('thesis', 'beamer' or 'pnas')
    fraction: float, optional
            Fraction of the width which the figure should occupy
    subplots: array-like, optional
            The number of rows and columns of subplots.
    Returns
    -------
    fig_dim: tuple
            (width, height) of the figure in inches
    """
    # Named presets, in points. Anything else is used as a width in points.
    presets = (
        ('thesis', 426.79135),
        ('beamer', 307.28987),
        ('pnas', 246.09686),
    )
    for preset_name, preset_pt in presets:
        if width == preset_name:
            width_pt = preset_pt
            break
    else:
        width_pt = width

    # Scale to the requested fraction, then convert pt -> inches.
    scaled_pt = width_pt * fraction
    pt_to_in = 1 / 72.27
    width_in = scaled_pt * pt_to_in

    # Height follows the golden ratio (https://disq.us/p/2940ij3),
    # stretched by the rows/columns ratio of the subplot grid.
    phi = (5**.5 - 1) / 2
    height_in = width_in * phi * (subplots[0] / subplots[1])

    return (width_in, height_in)

import re

def tex_escape(text):
    """
        Replace LaTeX special characters in a plain text message.

        :param text: a plain text message
        :return: the message with every special character swapped for its LaTeX escape
    """
    replacements = {
        '&': r'\&',
        '%': r'\%',
        '$': r'\$',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\^{}',
        '\\': r'\textbackslash{}',
        '<': r'\textless{}',
        '>': r'\textgreater{}',
    }
    # Every key is a single character, so the alternatives never overlap and
    # their order inside the pattern makes no difference.
    alternatives = [re.escape(char) for char in replacements]
    pattern = re.compile('|'.join(alternatives))

    def _substitute(match):
        return replacements[match.group()]

    return pattern.sub(_substitute, text)

### Good Research Stuff
# Choose a consistent alpha threshold for P-Value calculations
ALPHA = .05

def proportion_stats(incidences, total, label=None):
    """ Format a proportion as an APA-style summary string.

    The summary reports `incidences` as n, the percentage rounded to one
    decimal as M, and sqrt(p*(1-p))/sqrt(total) rounded to two decimals as SD.
    Without a `label` the summary is returned; with a `label` it is printed
    after the label and nothing is returned.
    """
    share = incidences/total
    percent_1dp = round(100*share*10)/10
    spread = math.sqrt(share*(1-share))/math.sqrt(total)
    spread_2dp = round(100*spread)/100
    summary = f"n={incidences}, M={percent_1dp}%, SD={spread_2dp}"
    if label is not None:
        print(label, summary)
        return None
    return summary

# Quick smoke tests of both calling styles.
print(proportion_stats(50, 100))
proportion_stats(75, 103, 'Test Case')


def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
    '''
    Print quick statistics for a quantitative column and draw its box plot.
    Arguments
    =========
    dataframe: pandas dataframe
    x: str. horizontal axis holding the labels of the categorical data (usually the target variable)
    y: str. vertical axis holding the quantitative data; this column is the one summarized
    hue: str. optional second categorical variable to compare against (usually the target variable if x is another variable)
    palette: array-like. Colour of the plot
    ax: axes passed through to the seaborn plotting calls
    verbose: if True, the value counts of the column are printed as well
    swarm: if True, a swarm plot is overlaid on the box plot
    Returns
    =======
    Nothing; the statistics are printed and the plot is shown
    '''
    values = dataframe[y]
    print(values.describe())
    print('mode: ', values.mode())
    if verbose:
        separator = '='*80
        print(separator)
        print(values.value_counts())

    # Both plots are drawn from exactly the same arguments.
    plot_args = dict(x=x, y=y, hue=hue, data=dataframe, palette=palette, ax=ax)
    sns.boxplot(**plot_args)
    if swarm:
        sns.swarmplot(**plot_args)

    plt.show()

## requirements.txt
tqdm
pandas
numpy
scipy
matplotlib
seaborn
numpy
jupyterlab
natsort
scikit-learn
statsmodels
	## Generally useful built libraries
	import sys
	import os
	import json
	import math
	import re
	import itertools
	from collections import defaultdict
	from dataclasses import dataclass
	from datetime import timedelta, datetime

	## Core Data Science Libraries

	# Main matplotlib usage is with `plt.whatever`
	import matplotlib.pyplot as plt
	# Sometimes you need style stuff via `mpl.whatever`
	import matplotlib as mpl
	# Statistic functions like `st.f_oneway`, `st.pearsonr`, etc.
	import scipy.stats as st
	# Your entire life is now pd.DataFrame(), pd.concat, etc.
	import pandas as pd
	# Real plotters use Seaborn over Matplotlib: sns.histplot, sns.jointplot, sns.lmplot, etc.
	import seaborn as sns
	# Sometimes you have to use numpy stuff, but almost always better to stay in Pandas
	import numpy as np

	## Quality of Life Libraries

	# Progress bars! Just do:
	# for an_item in tqdm(an_iterable):
	from tqdm.notebook import tqdm

	# Need to render HTML or Code? Combine them with `display`
	# Sometimes you will want to `print`, and sometimes `display`
	from IPython.display import display, HTML, Code

	## More Situational Libraries

	# Situational, but sometimes natsorting is helpful
	from natsort import index_natsorted, natsorted, order_by_index, natsort_keygen

	# Some more useful statistic related functions in here
	import sklearn.metrics as metrics

	# More stats stuff, mixed effects models, generalized linear models, ODEs, etc.
	import statsmodels as sm

	# Probably won't need to use hidden markov models
	#from hmmlearn import hmm



	### Style Stuff
	# Recommend putting this in a separate cell

	# At the minimum, make the figure background white so plots can be copy/pasted
	# into Discord (or any other dark-themed app) and stay readable :)
	mpl.rcParams['figure.facecolor'] = 'white'

	# Widths in inches: 240 and 504 divided by 72.27 (points per inch).
	# NOTE(review): presumably the column and text widths (in pt) of the target
	# LaTeX template -- confirm against the document class actually used.
	COLUMN_WIDTH = 240/72.27
	TEXT_WIDTH = 504/72.27

	# I liked this style, but use your best judgement.
	# The font settings further down are applied after this style sheet is loaded.
	plt.style.use('seaborn-v0_8-whitegrid')

	# Setup nice fonts for an ACM LaTeX paper
	nice_fonts = {
	# Ideally, you should enable this if you have LaTeX installed
	# It'll make the graphs match the paper font much more closely.
	# "text.usetex": True,
	"text.usetex": False,

	"font.family": "serif",
	# Use 10pt font in plots, to match 10pt font in document
	"axes.labelsize": 10,
	"font.size": 10,
	# Make the legend/tick-label fonts a little smaller
	"legend.fontsize": 8,
	"xtick.labelsize": 8,
	"ytick.labelsize": 8,
	}
	mpl.rcParams.update(nice_fonts)

	# Render floats in pandas output with two fixed decimal places
	# (which also stops pandas from switching to scientific notation).
	pd.set_option('display.float_format', '{:.2f}'.format)

	### Helper Functions

	def save_figure(filename, fig=None, folder='reports/figures/'):
	    """ Helper function to quickly save figures for the paper.

	    `filename` is joined onto `folder`, and the figure is written with
	    `format='pdf'` and a tight bounding box. When `fig` is omitted,
	    `plt.savefig` is called instead. `folder` is created when missing.
	    """
	    if fig is None:
	        fig = plt
	    # Create the output directory up front so the save does not fail on a
	    # missing path; an empty folder means the current directory.
	    if folder:
	        os.makedirs(folder, exist_ok=True)
	    fig.savefig(os.path.join(folder, filename), format='pdf', bbox_inches='tight')

	def display_code(data):
	    """ Wrap `data` in an IPython `Code` object and pass it to `display`. """
	    return display(Code(data))

	# Quick smoke test: show a one-line snippet.
	display_code("a = 0")

	def set_size(width, fraction=1, subplots=(1, 1)):
	    """ Set figure dimensions to avoid scaling in LaTeX.

	    Parameters
	    ----------
	    width: float or string
	            Document width in points, or string of predefined document type
	            ('thesis', 'beamer' or 'pnas')
	    fraction: float, optional
	            Fraction of the width which you wish the figure to occupy
	    subplots: array-like, optional
	            The number of rows and columns of subplots.
	    Returns
	    -------
	    fig_dim: tuple
	            Dimensions of figure in inches
	    """
	    if width == 'thesis':
	        width_pt = 426.79135
	    elif width == 'beamer':
	        width_pt = 307.28987
	    elif width == 'pnas':
	        width_pt = 246.09686
	    else:
	        width_pt = width

	    # Width of figure (in pts)
	    fig_width_pt = width_pt * fraction
	    # Convert from pt to inches
	    inches_per_pt = 1 / 72.27

	    # Golden ratio to set aesthetic figure height
	    # https://disq.us/p/2940ij3
	    golden_ratio = (5**.5 - 1) / 2

	    # Figure width in inches
	    fig_width_in = fig_width_pt * inches_per_pt
	    # Figure height in inches, scaled by the rows/columns ratio
	    fig_height_in = fig_width_in * golden_ratio * (subplots[0] / subplots[1])

	    return (fig_width_in, fig_height_in)

	import re

	def tex_escape(text):
	    """
	        :param text: a plain text message
	        :return: the message escaped to appear correctly in LaTeX
	    """
	    conv = {
	        '&': r'\&',
	        '%': r'\%',
	        '$': r'\$',
	        '#': r'\#',
	        '_': r'\_',
	        '{': r'\{',
	        '}': r'\}',
	        '~': r'\textasciitilde{}',
	        '^': r'\^{}',
	        '\\': r'\textbackslash{}',
	        '<': r'\textless{}',
	        '>': r'\textgreater{}',
	    }
	    # The separator must be a bare '|' (regex alternation). An escaped '\|'
	    # would turn the whole pattern into one literal string, so no single
	    # character would ever be replaced.
	    regex = re.compile('|'.join(re.escape(str(key)) for key in sorted(conv.keys(), key = lambda item: - len(item))))
	    return regex.sub(lambda match: conv[match.group()], text)

	### Good Research Stuff
	# Choose a consistent alpha threshold for P-Value calculations
	ALPHA = .05

	def proportion_stats(incidences, total, label=None):
	    """ Pretty prints a proportion with APA-style statistics.

	    Returns the summary string when `label` is None; otherwise prints the
	    label followed by the summary and returns nothing.
	    """
	    proportion = incidences/total
	    # Percentage rounded to one decimal place (the `*` operators were lost).
	    rounded_proportion = round(100*proportion*10)/10
	    standard_deviation = math.sqrt(proportion*(1-proportion))/math.sqrt(total)
	    rounded_sd = round(100*standard_deviation)/100
	    result = f"n={incidences}, M={rounded_proportion}%, SD={rounded_sd}"
	    if label is None:
	        return result
	    else:
	        print(label, result)

	# Quick smoke tests of both calling styles.
	print(proportion_stats(50, 100))
	proportion_stats(75, 103, 'Test Case')


	def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
	    '''
	    Helper function that gives a quick summary of quantitative data
	    Arguments
	    =========
	    dataframe: pandas dataframe
	    x: str. horizontal axis to plot the labels of categorical data (usually the target variable)
	    y: str. vertical axis to plot the quantitative data
	    hue: str. if you want to compare it to another categorical variable (usually the target variable if x is another variable)
	    palette: array-like. Colour of the plot
	    ax: axes passed through to the seaborn plotting calls
	    verbose: if True, the value counts of the column are printed as well
	    swarm: if swarm is set to True, a swarm plot is overlaid
	    Returns
	    =======
	    Nothing; quick stats of the data are printed and the box plot is shown
	    '''
	    series = dataframe[y]
	    print(series.describe())
	    print('mode: ', series.mode())
	    if verbose:
	        print('='*80)
	        print(series.value_counts())

	    sns.boxplot(x=x, y=y, hue=hue, data=dataframe, palette=palette, ax=ax)

	    if swarm:
	        sns.swarmplot(x=x, y=y, hue=hue, data=dataframe,
	                      palette=palette, ax=ax)

	    plt.show()
	tqdm
	pandas
	numpy
	scipy
	matplotlib
	seaborn
	numpy
	jupyterlab
	natsort
	scikit-learn
	statsmodels