Chris Rawles (crawles)
@crawles
crawles / bq_load_tsv.sh
Created June 13, 2018 16:40
How to load a TSV file into BigQuery
# Can work for other delimiters as well
# Tab delimiter
bq load --source_format=CSV --field_delimiter=tab \
  --skip_leading_rows 1 <destination_table> <source> \
  <schema>
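For example, a concrete load with an inline schema might look like this (the dataset, table, file, and field names are illustrative):
bq load --source_format=CSV --field_delimiter=tab \
  --skip_leading_rows 1 mydataset.mytable ./data.tsv \
  name:string,age:integer,city:string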
@crawles
crawles / Spark Dataframe Cheat Sheet.py
Last active April 26, 2022 03:09 — forked from evenv/Spark Dataframe Cheat Sheet.py
Cheat sheet for Spark Dataframes (using Python)
# A simple cheat sheet of Spark Dataframe syntax
# Current for Spark 1.6.1
# import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

sqlContext = SQLContext(sc)  # sc is the SparkContext, provided automatically in the pyspark shell

# creating dataframes
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"])  # from manual data
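A few more operations in the same cheat-sheet style, using the toy frame created above (Spark 1.6 syntax):
# selecting and filtering
df.select("A", "B").filter(df["A"] > 1).show()

# adding a derived column
df.withColumn("C", df["A"] + df["B"]).show()

# grouping and aggregating
df.groupBy("A").agg({"B": "max"}).show()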
@crawles
crawles / plpython_decorator.py
Last active February 7, 2021 06:25
A decorator for PL/Python, letting you write a PL/Python function as you would a normal Python function
import inspect
import pandas.io.sql as psql

def parametrized(dec):
    def layer(*args, **kwargs):
        """
        Wrapper for creating plpython functions. Argument types are provided in the docstring
        :param conn: Psycopg connection object
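The preview cuts off inside the docstring; the parametrized helper it opens with is the standard pattern for decorators that take their own arguments. A minimal self-contained sketch of that pattern (the names below are illustrative, not the gist's code):
def parametrized(dec):
    """Let a decorator accept its own arguments."""
    def layer(*args, **kwargs):
        def repl(f):
            return dec(f, *args, **kwargs)
        return repl
    return layer

@parametrized
def logged(f, label):
    # label is the decorator argument; f is the function being wrapped.
    def wrapper(*args, **kwargs):
        print(label, f.__name__)
        return f(*args, **kwargs)
    return wrapper

@logged("calling")
def add(x, y):
    return x + y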
# Explanation metadata: the baseline value supplied for each model input tensor.
explanation_metadata = {
    "inputs": {
        "dayofweek": {
            "input_tensor_name": "dayofweek:0",
            "input_baselines": [baselines_mode[0][0]]  # Thursday
        },
        "hourofday": {
            "input_tensor_name": "hourofday:0",
            "input_baselines": [baselines_mode[0][1]]  # 8pm
        },
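For Cloud AI Platform explanations, metadata in this shape is typically saved as explanation_metadata.json alongside the exported SavedModel. A minimal sketch, assuming the dictionary above is completed first (the bucket path is illustrative):
import json

with open('explanation_metadata.json', 'w') as f:
    json.dump(explanation_metadata, f)
# Then place the file next to saved_model.pb, e.g.:
#   gsutil cp explanation_metadata.json gs://my-bucket/model/export/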
@crawles
crawles / python_head_query.py
Created February 14, 2020 15:34
Run a preview in BigQuery using the `bq head` command and save the result to a pandas DataFrame.
import pandas as pd
from io import StringIO

def head(table, n=10):
    # Uses IPython's `!` shell capture, so run this in a notebook or IPython session.
    head_list = !bq head --n $n --table $table
    # Keep the column-header row and the data rows, dropping the ASCII table borders.
    head_str = '\n'.join([head_list[1]] + head_list[3:-1])
    return pd.read_csv(StringIO(head_str), delimiter="|").iloc[:, 1:-1]

df = head('publicdata:samples.natality')
def _numeric_column_normalized(column_name, normalizer_fn):
    return tf.feature_column.numeric_column(column_name, normalizer_fn=normalizer_fn)

def _make_zscaler(mean, std):
    def zscaler(col):
        return (col - mean) / std
    return zscaler

# Define your feature columns
def create_feature_cols(features, use_normalization):
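The preview stops at the create_feature_cols signature; a minimal sketch of how the body might wire the two helpers together, assuming the normalization statistics come from a pandas training DataFrame named dftrain (a hypothetical name, not from the gist):
def create_feature_cols(features, use_normalization):
    # One numeric feature column per feature, optionally z-scaled with
    # statistics taken from the training data.
    cols = []
    for name in features:
        if use_normalization:
            mean, std = dftrain[name].mean(), dftrain[name].std()
            normalizer_fn = _make_zscaler(mean, std)
        else:
            normalizer_fn = None
        cols.append(_numeric_column_normalized(name, normalizer_fn))
    return cols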
import numpy as np
import pandas as pd
import tensorflow as tf
tf.enable_eager_execution()
# Load dataset.
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')
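The snippets below reference an estimator est, an eval_input_fn, and a make_input_fn helper that the preview does not show. A minimal sketch of how they might be set up for the Titanic frames loaded above, using tf.estimator.BoostedTreesClassifier (the column subset, helper signature, and training settings are assumptions, not the original code):
fc = tf.feature_column

# A subset of the CSV columns, for illustration.
CATEGORICAL = ['sex', 'class', 'deck', 'embark_town', 'alone']
NUMERIC = ['age', 'fare']

feature_columns = []
for name in CATEGORICAL:
    vocab = dftrain[name].unique()
    feature_columns.append(fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(name, vocab)))
for name in NUMERIC:
    feature_columns.append(fc.numeric_column(name, dtype=tf.float32))

def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            ds = ds.shuffle(len(y))
        # Small dataset: one batch per epoch.
        return ds.repeat(n_epochs).batch(len(y))
    return input_fn

train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)

est = tf.estimator.BoostedTreesClassifier(feature_columns, n_batches_per_layer=1)
est.train(train_input_fn, max_steps=100)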
def permutation_importances(est, X_eval, y_eval, metric, features):
    """Column by column, shuffle values and observe effect on eval set.

    source: http://explained.ai/rf-importance/index.html
    A similar approach can be done during training. See "Drop-column importance"
    in the above article."""
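The docstring above describes standard permutation importance; a minimal self-contained sketch of that idea (the body below is an illustration, not the gist's implementation):
import numpy as np

def permutation_importances(est, X_eval, y_eval, metric, features):
    """Column by column, shuffle values and observe effect on eval set."""
    baseline = metric(est, X_eval, y_eval)  # score with intact data
    importances = []
    for col in features:
        saved = X_eval[col].copy()
        X_eval[col] = np.random.permutation(X_eval[col].values)  # destroy this column's signal
        importances.append(baseline - metric(est, X_eval, y_eval))
        X_eval[col] = saved  # restore the column
    return np.array(importances)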
def accuracy_metric(est, X, y):
    """TensorFlow estimator accuracy."""
    eval_input_fn = make_input_fn(X,
                                  y=y,
                                  shuffle=False,
# Get importances
importances = est.experimental_feature_importances(normalize=True)
df_imp = pd.Series(importances)
# Visualize importances.
N = 8
ax = (df_imp.iloc[0:N][::-1]
      .plot(kind='barh'))
# Make predictions.
pred_dicts = list(est.experimental_predict_with_explanations(eval_input_fn))
df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
# Plot results.
ID = 182
example = df_dfc.iloc[ID] # Choose ith example from evaluation set.
TOP_N = 8 # View top 8 features.
sorted_ix = example.abs().sort_values()[-TOP_N:].index
ax = example[sorted_ix].plot(kind='barh')