Anton Osika AntonOsika

## gist:51b39ff2626abef8003a615ccaeb6ffd

import json
import os

def writeHTML(j, f):
    """Write the HTML of a fragment and all of its comments to ``f``.

    The fragment body is written first, followed by each comment body in
    the order the comments appear. Nothing is inserted between the pieces.

    Args:
        j: Mapping with a ``"fragment"`` entry holding a ``"bodyHTML"`` value
            and a ``"comments"`` iterable whose items each hold ``"bodyHTML"``.
        f: Object exposing ``write``; it receives one call per HTML body.
    """
    fragment_html = j["fragment"]["bodyHTML"]
    f.write(fragment_html)

    # The comments are looked up only after the fragment has been written,
    # so a failure while reading them still leaves the fragment in the output.
    comment_bodies = (entry["bodyHTML"] for entry in j["comments"])
    for body in comment_bodies:
        f.write(body)

def main(args):

## create_stackdriver_sinks.sh

# Filter out a specific message type and store it to gcs as schemaless .jsonl and bq (which requires that schema does not change).
# Different Stackdriver labels create different partitions of the output.
# I recommend adding a version field to each log so that you can easily filter them to different sinks.

# The message type to export is the first positional argument. Abort with a
# usage message when it is missing, instead of creating a bucket and a sink
# whose names end in a dangling "-logs-".
message_type="${1:?usage: create_stackdriver_sinks.sh MESSAGE_TYPE}"

# Log filter that selects only entries whose JSON payload has this message type.
filter="jsonPayload.message_type=\"${message_type}\""
PROJECT_NAME=my_proj

# Bucket name shared by the bucket creation and the sink destination.
bucket="${PROJECT_NAME}-logs-${message_type}"

# All expansions are quoted so that a message type containing whitespace or
# glob characters is passed through as a single, literal argument.
gsutil mb "gs://${bucket}"
gcloud beta logging sinks create "${message_type}-to-gcs" "storage.googleapis.com/${bucket}" --log-filter="${filter}" --project="${PROJECT_NAME}"

## log_mutations.js
// Select the node that will be observed for mutations
// Options for the observer (which mutations to observe)
const config = {
    attributes: true,
    childList: true,
    subtree: true,
};

// Maps every element of an iterable to its innerHTML.
// A falsy argument (e.g. null or undefined) is returned unchanged.
const htmlify = (nodes) => nodes && [...nodes].map((node) => node.innerHTML);

// Starts at zero; the code that updates it is not visible in this snippet.
let counter = 0;

// Callback function to execute when mutations are observed
const callback = function (mutationsList, observer) {

## batch_different_length_timeseries.py
def generate_sequences(X, y, mask, batch_size=32, seed=0):
    """
    Returns a generator of batched timeseries padded to the longest sequence in the batch,
    using right zero padding.
    Can be used directly with model.fit_generator if the keys of X match the Keras Input tensors.
    Note that masks typically have 1 dimension less than labels.

    Args:
        X, list. Each element is a dictionary of 'feature_name': np.array
        y, list. Each element is np.array of labels

## EDA_config.py
import pandas as pd
import seaborn as sns
import matplotlib

# pandas display settings: floats are rendered with a thousands separator and
# two decimals, up to 999 columns are shown, and cell text is cut at 150 chars.
pd.set_option("display.float_format", '{:,.2f}'.format)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 150)

# seaborn plot style.
sns.set_style("whitegrid")

## label_encoder_for_production.py
import bisect

from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d

import numpy as np


class CategoricalTransform(LabelEncoder):

## practical_argparse.py
import argparse

def parsed_arguments(defaults: dict) -> argparse.Namespace:

    """
    Sets all variables defined in default_args() as command line arguments.
    (e.g. specify job_dir with: --job-dir [job_dir])

    Args:
        defaults: dict of default argument names and values. Any `_` will be replaced with `-`.

## excluded_datetimes.py
from __future__ import print_function

import datetime


def excluded_datetimes(start_str):
    """
    Creates regex matching every datetime chronologically before start_str.
    Can be used to exclude files/folders with e.g. rsync file downloads.
    """

## downloadURLs.py

import urllib
import os
import re

##############################
# Downloads files for every link it finds.
# The URLs can be handpicked with regex fileURLs.
##############################

	import json
	import os

	def writeHTML(j, f):
	f.write(j["fragment"]["bodyHTML"])
	for comment in j["comments"]:
	f.write(comment["bodyHTML"])

	def main(args):

	# Filter out a specific message type and store it to gcs as schemaless .jsonl and bq (which requires that schema does not change).
	# Different Stackdriver labels create different partitions of the output.
	# I recommend adding a version field to each log so that you can easily filter them to different sinks.

	filter="jsonPayload.message_type=\"$1\""
	PROJECT_NAME=my_proj

	gsutil mb gs://$PROJECT_NAME-logs-$1
	gcloud beta logging sinks create $1-to-gcs storage.googleapis.com/$PROJECT_NAME-logs-$1 --log-filter="$filter" --project=$PROJECT_NAME
	// Select the node that will be observed for mutations
	// Options for the observer (which mutations to observe)
	const config = {attributes: true, childList: true, subtree: true};

	const htmlify = xs => xs && [...xs].map(x => x.innerHTML)

	let counter = 0

	// Callback function to execute when mutations are observed
	const callback = function (mutationsList, observer) {
	def generate_sequences(X, y, mask, batch_size=32, seed=0):
	"""
	Returns a generator of batched timeseries padded to the longest sequence in the batch,
	using right zero padding.
	Can be used directly with model.fit_generator if the keys of X match the Keras Input tensors.
	Note that masks typically have 1 dimension less than labels.

	Args:
	X, list. Each element is a dictionary of 'feature_name': np.array
	y, list. Each element is np.array of labels
	import pandas as pd
	import seaborn as sns
	import matplotlib

	pd.options.display.float_format = '{:,.2f}'.format
	pd.options.display.max_columns = 999
	pd.options.display.max_colwidth = 150

	sns.set_style("whitegrid")
	import bisect

	from sklearn.preprocessing import LabelEncoder
	from sklearn.utils.validation import check_is_fitted
	from sklearn.utils import column_or_1d

	import numpy as np


	class CategoricalTransform(LabelEncoder):
	import argparse

	def parsed_arguments(defaults: dict) -> argparse.Namespace:

	"""
	Sets all variables defined in default_args() as command line arguments.
	(e.g. specify job_dir with: --job-dir [job_dir])

	Args:
	defaults: dict of default argument names and values. Any `_` will be replaced with `-`.
	from __future__ import print_function

	import datetime


	def excluded_datetimes(start_str):
	"""
	Creates regex matching every datetime chronologically before start_str.
	Can be used to exclude files/folders with e.g. rsync file downloads.
	"""

	import urllib
	import os
	import re

	##############################
	# Downloads files for every link it finds.
	# The URLs can be handpicked with regex fileURLs.
	##############################