Miklos C mrchristine

@mrchristine
mrchristine / find_clones.py
Created February 25, 2020 18:15
Find cloned notebooks and identify the most-cloned ones
# $ cat nb_names.log | sort | uniq -c | sort -nrk1 | head
import os, re
# find cloned notebooks with parens
pattern = re.compile(r"\((\d+)\)")
with open('user_workspace.log', 'r') as fp, open('nb_names.log', 'w') as fp_w:
    for x in fp:
        nb_name = os.path.basename(x.rstrip())
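        # (sketch — the loop body is truncated in this preview; assuming clone
        #  suffixes like "My Notebook (1)" are stripped so the `uniq -c` pipeline
        #  above can count copies per base name)
        base_name = pattern.sub("", nb_name).strip()
        fp_w.write(base_name + "\n")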
@mrchristine
mrchristine / get_spark_ui.py
Created February 12, 2020 23:54
Script to get the Spark UI dynamically
ui_port = spark.sql("set spark.ui.port").collect()[0].value
env = "myenvironment.cloud.databricks.com"
cluster_id = dbutils.notebook.entry_point.getDbutils().notebook().getContext().clusterId().getOrElse(None)
url = "https://{0}/driver-proxy-api/o/0/{1}/{2}/api/v1/".format(env, cluster_id, ui_port)
import requests
token = "TOKEN"
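The preview stops after the token. A minimal sketch of how the proxied Spark REST API might then be queried; the Bearer header and the applications endpoint are assumptions, not shown in the gist.
headers = {"Authorization": "Bearer {0}".format(token)}
# e.g. list Spark applications through the Databricks driver proxy
resp = requests.get(url + "applications", headers=headers)
print(resp.json())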
@mrchristine
mrchristine / iam.py
Created January 15, 2020 16:47
Bypass IAM Check
import requests
token = 'MYTOKEN'
url = 'https://EXAMPLE.cloud.databricks.com'
ip = 'arn:aws:iam::123456789:instance-profile/databricks_special_role'
class DatabricksRestClient:
"""A class to define wrappers for the REST API"""
@mrchristine
mrchristine / get_s3_storage_costs.sh
Created December 11, 2019 15:33
Calculate S3 storage costs
#!/bin/bash
# get the last date in the file
last_date=`cat "$@" | awk -F',' '{print $5}' | awk '{print $1}' | grep -v "Start" | sort | uniq | tail -n1`
# pass in the report.csv and calculate total storage costs for StandardStorage tier
cat "$@" | grep $last_date | awk -F, '{printf "%.2f GB %s %s \n", $7/(1024**3 )/24, $4, $2}' | grep "StandardStorage" | uniq | sort -n
echo "Processed for $last_date"
@mrchristine
mrchristine / decode_aws_error.sh
Created December 5, 2019 15:33
Decode and pretty print an encoded error message from AWS
#!/bin/bash
# grab decoded error message
error=`aws sts decode-authorization-message --encoded-message $@ | jq .DecodedMessage`
# trim the start and end double quotes
json_err=${error:1: -1}
# remove escaped quoted strings and pretty print with jq
echo $json_err | sed 's|\\"|"|g' | jq .
@mrchristine
mrchristine / spark_stuff.scala
Created June 7, 2019 15:53
Spark Notes / Tips to Remember
spark.conf.isModifiable("spark.sql.shuffle.partitions")
@mrchristine
mrchristine / spark_schema_save_n_load.py
Created May 28, 2019 21:12
Read / Write Spark Schema to JSON
##### READ SPARK DATAFRAME
df = spark.read.option("header", "true").option("inferSchema", "true").csv(fname)
# the header in the first file provides the column names; inferSchema derives the column types
df_schema = df.schema
##### SAVE JSON SCHEMA INTO S3 / BLOB STORAGE
# save the schema so the streaming job can load it during the next run
dbutils.fs.rm("/home/mwc/airline_schema.json", True)
with open("/dbfs/home/mwc/airline_schema.json", "w") as f:
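    # (sketch — the preview ends inside this with-block; assuming the schema is
    #  written out as JSON and rebuilt later with StructType.fromJson)
    f.write(df_schema.json())

##### LOAD JSON SCHEMA IN THE NEXT JOB (illustrative continuation)
import json
from pyspark.sql.types import StructType
with open("/dbfs/home/mwc/airline_schema.json", "r") as f:
    saved_schema = StructType.fromJson(json.load(f))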
@mrchristine
mrchristine / update_legacy_job_templates.py
Last active November 6, 2018 16:59
Job to update legacy instance types on Databricks
import json, pprint, requests, datetime
################################################################
## Replace the token variable and environment url below
################################################################
# Helper to pretty print json
def pprint_j(i):
    print json.dumps(i, indent=4, sort_keys=True)
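The preview stops at the helper. A hedged sketch of what the update flow might look like against the Jobs API; the node-type mapping, token, and URL below are placeholders, not values from the gist.
token = "REPLACE_TOKEN"
url = "https://myenv.cloud.databricks.com"
headers = {"Authorization": "Bearer " + token}
# example mapping of legacy to current instance types (adjust as needed)
node_map = {"r3.xlarge": "r5.xlarge", "c3.4xlarge": "c5.4xlarge"}
jobs = requests.get(url + "/api/2.0/jobs/list", headers=headers).json().get("jobs", [])
for job in jobs:
    cluster = job["settings"].get("new_cluster", {})
    if cluster.get("node_type_id") in node_map:
        cluster["node_type_id"] = node_map[cluster["node_type_id"]]
        resp = requests.post(url + "/api/2.0/jobs/reset", headers=headers,
                             json={"job_id": job["job_id"], "new_settings": job["settings"]})
        pprint_j(resp.json())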
@mrchristine
mrchristine / vector_sum_udaf.scala
Created November 29, 2017 21:39
Spark UDAF to sum vectors for common keys
package com.databricks.example.pivot
/**
This code allows a user to add vectors together for common keys.
The code in the comments shows you how to register the Scala UDAF to be called from pyspark.
The UDAF can only be called from a SQL expression (aka spark.sql() or df.expr()).
**/
/**
# Python code to register a scala UDAF
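# (sketch — the preview cuts off here; assumes the UDAF class defined below is
#  named VectorSumUDAF and Spark 2.3+ for registerJavaUDAF)
spark.udf.registerJavaUDAF("vector_sum", "com.databricks.example.pivot.VectorSumUDAF")
spark.sql("SELECT key, vector_sum(vec) AS vec_sum FROM vectors GROUP BY key")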
@mrchristine
mrchristine / spark-submit-run-once.sh
Created September 6, 2017 17:03
spark-submit transient run example
#!/bin/bash
usage="Add jars to the input arguments to specify the spark job. -h list the supported spark versions"
RUNTIME_VERSION="3.2.x-scala2.11"
NODE_TYPE="r3.xlarge"
while getopts ':hs:' option; do
  case "$option" in
    h) echo "$usage"
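       # (sketch — the preview ends here; the remaining branches are assumed
       #  from the ':hs:' getopts string above)
       exit 0
       ;;
    s) RUNTIME_VERSION=$OPTARG
       ;;
    \?) echo "illegal option: -$OPTARG" >&2
       exit 1
       ;;
  esac
done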