Skip to content

Instantly share code, notes, and snippets.

I may be slow to respond.

David Yerrington dyerrington

I may be slow to respond.
View GitHub Profile
View gist:6850e459c37b3f01e4049505c1634256
def fetch_states(doc=None):
    """Return all <option> values for the <select> element with all states.

    Parameters
    ----------
    doc : parsed lxml HTML tree, optional
        Document to scrape. Defaults to the module-level ``tree`` so
        existing ``fetch_states()`` callers keep working.

    Returns
    -------
    dict
        Maps each option's ``value`` attribute to its display text,
        e.g. ``{"CA": "California", ...}``.
    """
    if doc is None:
        doc = tree  # fall back to the globally parsed page
    data = {}
    # First <select name="state"> on the page holds one <option> per state.
    states = doc.xpath('//select[@name="state"]')
    for state in states[0].xpath('option'):
        data[state.attrib['value']] = state.text_content()
    # BUG FIX: the original built `data` but never returned it.
    return data
import requests
import re


# NOTE(review): the gist preview appears truncated — the request URL below is
# only f"{year}/"; presumably a base URL for the data directory precedes it.
# Confirm against the original gist before running.
def test_station_data_availability(station_id):
    """Probe year-by-year directory listings for a station's CSV file.

    For each year 1960..2020 (inclusive), fetch that year's directory
    listing and report whether a ``<6-digit-prefix><station_id>.csv``
    link appears in it.

    Parameters
    ----------
    station_id : int or str
        Station identifier appended to the 6-digit file-name prefix.
    """
    for year in range(1960, 2020 + 1):
        r = requests.get(f"{year}/")
        # BUG FIX: the original assigned the pattern *string* to `matches`
        # instead of searching the response text with it. Also escape the
        # literal dot in ".csv".
        matches = re.search(r"href=\"([0-9]{6}" + str(station_id) + r"\.csv)", r.text)
        if matches:
            print(station_id, " data exists for ", year)
        else:
            # BUG FIX: originally printed unconditionally, so every year was
            # also reported as "not found".
            print(station_id, " data not found for ", year)
View alcohol_correlation.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View set_correlation_matrix_identity.py
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
# Load example wine dataset from sklearn.
# The usage below shows `data` is dict-like with 'data' (the feature matrix)
# and 'feature_names' entries.
data = load_wine()
# Create a basic DataFrame: one column per named feature, one row per wine
# sample. (Gist preview is truncated — later use of `df` is not visible.)
df = pd.DataFrame(data['data'], columns = data['feature_names'])
import pandas as pd
# Parse the current system clipboard contents into a DataFrame (pandas applies
# read_csv-style parsing) — handy for grabbing a table copied from a
# spreadsheet or a web page.
df = pd.read_clipboard()
dyerrington /
Last active Feb 7, 2020
Ordinary least squares implemented with numpy.
import numpy as np
import sys

# Parse whitespace-separated training data from stdin for an OLS fit.
# Expected format: line 0 is "<n_features> <n_observations>", followed by
# n_observations rows of (n_features feature values + target value(s)).
lines = sys.stdin.readlines()
train_header = lines[0].split()
n_train_features, n_train_observations = int(train_header[0]), int(train_header[1])
# BUG FIX: the slice must cover rows 1 .. n_train_observations inclusive
# (row 0 is the header); `lines[1:n_train_observations]` dropped the last
# observation.
training = np.array([row.split() for row in lines[1:1 + n_train_observations]], dtype = float)
# First n_train_features columns are the design matrix; the remainder are the
# target column(s).
X_train, y_train = training[:, 0:n_train_features], training[:, n_train_features:]
dyerrington /
Created Feb 6, 2020
Code pearson correlation coefficient from scratch
import pandas as pd
import math
# NOTE(review): `input` must be a pre-bound *string* holding the whole problem
# input (HackerRank-style); the builtin input() function has no .split —
# confirm how this name is supplied before running.
# Tokenize every non-empty line into whitespace-separated fields.
lines = [line.split() for line in input.split("\n") if len(line)]
# Each line appears to look like "<name> <sep> s1 s2 ..."; the first two
# tokens are discarded and the rest parsed as integer scores. The 2-way
# unpack assumes exactly two such lines (X scores and y scores).
X, y = [[int(score) for score in scores] for index, (variable, _, *scores) in enumerate(lines)]
# Number of paired observations.
n = len(X)
sum_X, sum_y = sum(X), sum(y)
# Sum of element-wise products — the cross term of the Pearson numerator.
# (Gist preview is truncated; the remaining sums and the final coefficient
# are not visible here.)
sum_Xy = sum([X[i] * y[i] for i in range(len(X))])
from statsmodels.stats.power import tt_ind_solve_power
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
def test_ttest_power_diff(mean, std, sample1_size=None, alpha=0.05, desired_power=0.8, mean_diff_percentages=[0.1, 0.05]):
calculates the power function for a given mean and std. the function plots a graph showing the comparison between desired mean differences
:param mean: the desired mean
:param std: the std value
:param sample1_size: if None, it is assumed that both samples (first and second) will have same size. The function then will
dyerrington /
Created Nov 7, 2019
Basic implementation of a matplotlib polar plot using basic observations with multiple variables.
from math import pi
from mpl_toolkits.axes_grid.inset_locator import inset_axes
# Set data
df = pd.DataFrame({
# 'group': ['A','B','C','D'],
'var1': [38, 1.5, 30, 4],
'var2': [29, 10, 9, 34],
'var3': [8, 39, 23, 24],
'var4': [7, 31, 33, 14]
dyerrington /
Created Sep 18, 2019
Python code that will create, essentially, a pivot from a nested BigQuery set. Based on the original method in the Google BigQuery documentation.
# fighting == most common event type
def build_udf_prototype(event_types):
    """Build the column skeleton for a BigQuery UDF that pivots event types.

    Parameters
    ----------
    event_types : pandas.Series-like
        Exposes ``.tolist()``; each entry is an event-type name. Dashes are
        replaced with underscores since they are illegal in identifiers.

    Returns
    -------
    str
        Comma-separated ``col_<event_type> INT64`` declarations (with a
        trailing ", " — NOTE(review): the gist preview is truncated, so any
        downstream trimming/wrapping is not visible here).
    """
    null = "null"  # default all types to null in the UDF function
    # Struct prototype mapping each pivot column to null; the str/replace
    # dance emits bare `null` (no quotes) so the text is valid SQL/JS.
    PIVOT_FEATURES = str({"col_" + event_name.replace("-", "_"): null for event_name in event_types.tolist()}).replace("'null'", "null")
    # BUG FIX: SQL_RETURN was appended to below without ever being
    # initialized, raising NameError on first use.
    SQL_RETURN = ""
    for event_type in event_types.tolist():
        event_type = event_type.replace("-", "_")
        SQL_RETURN += f"col_{event_type} INT64, "
    return SQL_RETURN
You can’t perform that action at this time.