David Yerrington dyerrington

## read_clipboard.py
import pandas as pd
# Parse the text currently on the system clipboard into a DataFrame.
df = pd.read_clipboard()

## basic_ols_numpy_example.py
import numpy as np
import sys

# Input arrives on stdin: a header line "<n_features> <n_observations>",
# followed by one whitespace-separated row per training observation.
# (For local debugging, swap in: lines = input.split("\n"))
lines = sys.stdin.readlines()

train_header = lines[0].split()
n_train_features, n_train_observations = int(train_header[0]), int(train_header[1])

# The header occupies lines[0], so the training rows are lines[1] through
# lines[n_train_observations] inclusive. The slice end therefore has to be
# n_train_observations + 1; stopping at n_train_observations silently drops
# the last observation.
training = np.array(
    [row.split() for row in lines[1:n_train_observations + 1]],
    dtype=float,
)

# The first n_train_features columns are the features; the remaining
# column(s) are kept as the target, still 2-D.
X_train, y_train = training[:, :n_train_features], training[:, n_train_features:]

## pearson.py
import pandas as pd
import math

# Split every non-empty line of the raw text into whitespace-separated tokens.
# NOTE(review): `input` is expected to be a string bound earlier in the script
# (not the builtin function) -- confirm against the full file.
lines = [raw.split() for raw in input.split("\n") if raw]

# For each row, discard the first two tokens and convert the rest to ints.
# The two-target unpacking requires exactly two rows: the first becomes X,
# the second becomes y.
X, y = [
    [int(token) for token in scores]
    for variable, _, *scores in lines
]

# Aggregate quantities: sample count, plain sums, and the sum of the
# element-wise products of X and y.
n = len(X)
sum_X = sum(X)
sum_y = sum(y)
sum_Xy = sum(X[idx] * y[idx] for idx in range(n))

## test_ttest_power_diff.py
from statsmodels.stats.power import  tt_ind_solve_power
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

def test_ttest_power_diff(mean, std, sample1_size=None, alpha=0.05, desired_power=0.8, mean_diff_percentages=[0.1, 0.05]):
    '''
    calculates the power function for a given mean and std. the function plots a graph showing the comparison between desired mean differences
    :param mean: the desired mean
    :param std: the std value
    :param sample1_size: if None, it is assumed that both samples (first and second) will have same size. The function then will

## polar_plot.py
from math import pi
from mpl_toolkits.axes_grid.inset_locator import inset_axes

# Set data
df = pd.DataFrame({
    # 'group': ['A','B','C','D'],
    'var1': [38, 1.5, 30, 4],
    'var2': [29, 10, 9, 34],
    'var3': [8, 39, 23, 24],
    'var4': [7, 31, 33, 14]

## generate_udf_js_big_query.py
# fighting == most common event type

def build_udf_prototype(event_types):

    null = "null" # default all types to null in the UDF function
    PIVOT_FEATURES = str({"col_" + event_name.replace("-", "_"): null for event_name in event_types.tolist()}).replace("'null'", "null")
    SQL_RETURN = "STRUCT<"
    for event_type in event_types.tolist():
        event_type = event_type.replace("-", "_")
        SQL_RETURN += f"col_{event_type} INT64, "

## hiring_guidelines.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              2 stars
            
          
                dyerrington
                / hiring_guidelines.md
            
            
              Created
              July 17, 2019 20:27
            
          
    Great Data Science Project Criteria:


Problem statement that defines a measurable and/or falsifiable outcome.  “Frequency of [specific event] is influential over [some outcome]”. “Users who use [some feature in app] are differentiable from users who less frequently use [some feature in app]”. etc.  If you can’t frame a data problem properly, none of it has purpose.  The biggest challenge in data science is making sense of and defining the gray area of business problems.  This also comes with experience.
EDA EDA EDA.  Define your scope.  Report only what is necessary and relevant to your problem statement.  If the model reports only 4-5 common variables as parameters (logistic regression for instance), focus on those when summarizing your work in terms of EDA.
How much data is necessary to make this analysis work?  Are you sampling?  Is a t-test necessary to gain assurance or a rank order test?
Explain which model makes the most sense to use. Are you trying to gain inference about a data problem?


## sf_slicing_apply_map.ipynb

      
              1 file
            
          
              3 forks
            
          
              0 comments
            
          
              0 stars
            
          
                dyerrington
                / sf_slicing_apply_map.ipynb
            
            
              Created
              March 9, 2019 01:04
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## parse_jupyter.md

      
              2 files
            
          
              1 fork
            
          
              0 comments
            
          
              1 star
            
          
                dyerrington
                / parse_jupyter.md
            
            
              Created
              October 16, 2018 19:55
            
          
    Parse Jupyter

This is a basic class that makes it convenient to parse notebooks.  I built a larger version of this that was used for clustering documents to create semantic indices that linked related content together for a personal project.  You can use this to parse notebooks for doing things like NLP or preprocessing.
Usage

parser = ParseJupyter("./Untitled.ipynb")
parser.get_cells(source_only = True, source_as_string = True)


## machine_learning_flashcards.py
import tweepy
import wget
import os

oauth = {
    "consumer_key":        "",
    "consumer_secret":     ""
}

access = {
	import numpy as np
	import sys

	# lines = input.split("\n")
	lines = sys.stdin.readlines()

	train_header = lines[0].split()
	n_train_features, n_train_observations = int(train_header[0]), int(train_header[1])
	training = np.array([row.split() for row in lines[1:n_train_observations]], dtype = float)
	X_train, y_train = training[:, 0:n_train_features], training[:, n_train_features:]
	import pandas as pd
	import math

	lines = [line.split() for line in input.split("\n") if len(line)]

	X, y = [[int(score) for score in scores] for index, (variable, _, *scores) in enumerate(lines)]

	n = len(X)
	sum_X, sum_y = sum(X), sum(y)
	sum_Xy = sum([X[i] * y[i] for i in range(len(X))])
	from statsmodels.stats.power import tt_ind_solve_power
	from scipy.interpolate import interp1d
	import matplotlib.pyplot as plt

	def test_ttest_power_diff(mean, std, sample1_size=None, alpha=0.05, desired_power=0.8, mean_diff_percentages=[0.1, 0.05]):
	'''
	calculates the power function for a given mean and std. the function plots a graph showing the comparison between desired mean differences
	:param mean: the desired mean
	:param std: the std value
	:param sample1_size: if None, it is assumed that both samples (first and second) will have same size. The function then will
	from math import pi
	from mpl_toolkits.axes_grid.inset_locator import inset_axes

	# Set data
	df = pd.DataFrame({
	# 'group': ['A','B','C','D'],
	'var1': [38, 1.5, 30, 4],
	'var2': [29, 10, 9, 34],
	'var3': [8, 39, 23, 24],
	'var4': [7, 31, 33, 14]
	# fighting == most common event type

	def build_udf_prototype(event_types):

	null = "null" # default all types to null in the UDF function
	PIVOT_FEATURES = str({"col_" + event_name.replace("-", "_"): null for event_name in event_types.tolist()}).replace("'null'", "null")
	SQL_RETURN = "STRUCT<"
	for event_type in event_types.tolist():
	event_type = event_type.replace("-", "_")
	SQL_RETURN += f"col_{event_type} INT64, "
	import tweepy
	import wget
	import os

	oauth = {
	"consumer_key": "",
	"consumer_secret": ""
	}

	access = {