@tonyfraser
tonyfraser / marathonAndZeppelin.scala
Last active September 25, 2019 21:19
Get a Marathon bearer token and use it to clear all paragraphs in a Zeppelin notebook
//uses sttp module
import com.softwaremill.sttp.{HttpURLConnectionBackend, _}
import scala.util.parsing.json._
implicit lazy val backend = HttpURLConnectionBackend()
//first get a marathon bearer token.
val loginPostBody = "{ \"uid\": \"{username}\", \"password\": \"{password}\" }"
// preview truncated here; a sketch of the remaining call (the DC/OS login
// endpoint and the "token" response field are assumptions about the original):
val tok = JSON.parseFull(
  sttp.post(uri"https://{marathon-domain}/acs/api/v1/auth/login")
    .contentType("application/json")
    .body(loginPostBody)
    .send().unsafeBody
).map(_.asInstanceOf[Map[String, Any]]("token").toString)
@tonyfraser
tonyfraser / zeppelin_test.sh
Last active September 25, 2019 21:18
Use curl to trigger the Zeppelin API within a Mesos cluster
#!/bin/bash
# -> remember to run: dcos auth login first !!
DCOS_API_TOKEN=$(dcos config show core.dcos_acs_token)
url="http://{marathon-domain}/service/{marathon zeppelin name}"
notebook="2E617JZX1" # $url/#/notebook/2E617JZX1
paragraph="20190916-164803_817623738"
#Note: to get paragraph ID, download notebook, open json and look for -> paragraphs -> Item [N] -> id.
curl --request GET -s -H "Content-Type: application/json" -H "Authorization: token=$DCOS_API_TOKEN" $url/api/notebook
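The preview stops at the notebook-listing call, but the unused `notebook` and `paragraph` variables suggest the next step is running a single paragraph. A hedged sketch using Zeppelin's standard `run/{noteId}/{paragraphId}` REST endpoint (placeholder values mirror the variables above; the real call needs a live cluster and token):

```shell
# Build the "run one paragraph" URL from the same variables the gist defines.
url="http://{marathon-domain}/service/{marathon zeppelin name}"
notebook="2E617JZX1"
paragraph="20190916-164803_817623738"
run_url="$url/api/notebook/run/$notebook/$paragraph"

# The actual call (requires a reachable cluster and a valid token):
# curl --request POST -s -H "Authorization: token=$DCOS_API_TOKEN" "$run_url"
echo "$run_url"
```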
@tonyfraser
tonyfraser / emptyToNullUdf.scala
Created September 23, 2019 17:00
spark/scala : Convert all empty string records in a dataframe to null.
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
// Usage: df.select(df.columns.map(c => emptyToNullUdf(col(c)).alias(c)): _*)
def emptyToNull(str: String): Option[String] =
  str match {
    case s if s == null || s.trim.isEmpty => None
    case s => Some(s)
  }
val emptyToNullUdf: UserDefinedFunction = udf(emptyToNull(_: String))
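Because the UDF wraps plain Scala, its behavior can be sanity-checked without a SparkSession (the object name below is just for the check):

```scala
object EmptyToNullCheck {
  // Same logic as the gist's emptyToNull, minus the Spark wrapper.
  def emptyToNull(str: String): Option[String] = str match {
    case s if s == null || s.trim.isEmpty => None
    case s => Some(s)
  }

  def main(args: Array[String]): Unit = {
    assert(emptyToNull(null).isEmpty)
    assert(emptyToNull("   ").isEmpty)
    assert(emptyToNull("episode1").contains("episode1"))
    println("ok")
  }
}
```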
@tonyfraser
tonyfraser / addColumnIfDoesNotExist.scala
Created September 4, 2019 20:38
Dynamically add a column to a Spark DataFrame if it does not already exist
//An example of dynamically adding a column if it does not exist
val df = Seq(
("channel_one", "my_show", "episode1"),
("channel_one", "my_show", "episode2")
).toDF("network_name", "show_name", "episode")
//there is no rank column so add one
val newdf = df.columns match {
  case cols if cols contains "rank" => df
  case _ => df.withColumn("rank", lit("0"))
}
@tonyfraser
tonyfraser / DataFrameConverter.py
Last active May 11, 2020 17:15 — forked from zaloogarcia/pandas_to_spark.py
Script for converting a Pandas DF to Spark's DF, but with support for ArrayType[StringType]
# https://stackoverflow.com/questions/37513355/converting-pandas-dataframe-into-spark-dataframe-error/56895546#56895546
# modified from parent gist by creating a dict type that contains df.dtypes AND type(pd.columnname)
#
# Looks something like this.
# { 'stringtypecolumn': {'dtype': 'object', 'actual': 'str'},
# 'act_num': {'dtype': 'int32', 'actual': 'numpy.int32'},
# 'text_dat': {'dtype': 'object', 'actual': 'list'},
# 'scene_description': {'dtype': 'object', 'actual': 'NoneType'},
# 'keywords': {'dtype': 'object', 'actual': 'list'}}
#
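The dtype/actual mapping described above can be built with a dict comprehension along these lines (a sketch; the sample `df` and the choice of inspecting the first value in each column are assumptions, not the gist's actual code):

```python
import pandas as pd

df = pd.DataFrame({
    "stringtypecolumn": ["a", "b"],
    "text_dat": [["x"], ["y", "z"]],
})
df["act_num"] = pd.Series([1, 2], dtype="int32")

# For each column, record both the pandas dtype and the Python type of the
# first value -- 'object' columns can hide str, list, or None.
type_map = {
    c: {"dtype": str(df[c].dtype), "actual": type(df[c].iloc[0]).__name__}
    for c in df.columns
}
print(type_map)
```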
@tonyfraser
tonyfraser / Dockerfile
Last active July 29, 2019 17:46
Dockerfile for running pyspark and python3
FROM openjdk:8
# python:3 doesn't have Java, so switched to openjdk.
# ==> openjdk:8 contains Java 1.8 and is a Debian image;
# far easier to start there than to apt-get install default-jdk.
WORKDIR /usr/src/app
# first get these jars into the docker container
# ~/thisGist/lib: tony$ ls -al
# total 23656
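The preview cuts off at the jar listing; a minimal continuation in the same spirit might look like this (the package names, paths, and `pip3 install pyspark` step are assumptions, not the gist's actual contents):

```dockerfile
FROM openjdk:8
WORKDIR /usr/src/app

# Debian base, so python3/pip3 come from apt.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install pyspark

# jars shipped alongside the Dockerfile (see the ls listing above)
COPY lib/ /usr/src/app/lib/
COPY . .

CMD ["python3"]
```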
@tonyfraser
tonyfraser / githhub-clean-branch-history.log
Last active July 22, 2019 05:14
This is how you remove unwanted files and/or directories from commit history on github.com.
# Get your files ignored correctly on the file system and website, keep checking until
# the latest commit is perfect.
639 vi ./.gitignore
640 git rm -r --cached .
641 git add -A
642 git commit -m "adding"
643 git push origin master
# now create a temp branch off master, then delete the master, and push master back up again.
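The history log stops just before the branch swap. A hedged sketch of that step, demonstrated on a throwaway repo so nothing real is touched (this uses the orphan-branch approach; the gist's exact commands are cut off):

```shell
# Demo in a temp repo: two commits, then replace history with one commit.
repo="$(mktemp -d)"
cd "$repo"
git init -q .
git config user.email "you@example.com"
git config user.name "you"

echo "one" > file.txt; git add file.txt; git commit -qm "first"
echo "two" >> file.txt; git commit -qam "second"

# Swap: orphan branch with no history, delete old branch, rename to master.
old_branch="$(git symbolic-ref --short HEAD)"
git checkout -q --orphan temp
git add -A
git commit -qm "fresh history"
git branch -D "$old_branch"
git branch -m master

git rev-list --count HEAD    # -> 1
# On the real repo you would then: git push -f origin master
```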
@tonyfraser
tonyfraser / exercises.scala
Last active July 19, 2019 15:14
udemy/Scala and Spark for Big Data and Machine Learning
// scala-and-spark-for-big-data-and-machine-learning
//Section 8 lesson 33
//Find out if you have all even numbers in a list.
List(0, 2, 4).
map(_%2).sum == 0
//Lucky number 7 card problem, Add your cards, but double 7 if you get it.
List(0, 2, 5, 7).
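The preview truncates the card-problem expression. One plausible completion, assuming the usual "double any 7 before summing" reading of the exercise (the object and method names are just for illustration):

```scala
object LuckySeven {
  // Sum the cards, doubling any 7 before summing.
  def luckySum(cards: List[Int]): Int =
    cards.map(c => if (c == 7) c * 2 else c).sum

  def main(args: Array[String]): Unit = {
    println(luckySum(List(0, 2, 5, 7)))  // 0 + 2 + 5 + 14 = 21
  }
}
```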
@tonyfraser
tonyfraser / full-access-s3-subkey-policy.json
Created June 21, 2019 15:56
A full-access S3 policy, designed to scope a subdirectory/subkey to the policy. Think s3://bucket/environment/dev, assigning only the dev prefix to this policy.
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation"
],
"Effect": "Allow",
"Resource": [
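The Resource array is cut off in the preview. A complete policy in this shape typically pairs account-wide list permissions with bucket listing and object access restricted to the prefix; a sketch with placeholder names (`mybucket` and `environment/dev` are assumptions, not the gist's values):

```json
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": ["s3:ListAllMyBuckets", "s3:GetBucketLocation"],
      "Effect": "Allow",
      "Resource": ["arn:aws:s3:::*"]
    },
    {
      "Action": ["s3:ListBucket"],
      "Effect": "Allow",
      "Resource": ["arn:aws:s3:::mybucket"],
      "Condition": {
        "StringLike": { "s3:prefix": ["environment/dev/*"] }
      }
    },
    {
      "Action": ["s3:*"],
      "Effect": "Allow",
      "Resource": ["arn:aws:s3:::mybucket/environment/dev/*"]
    }
  ]
}
```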
@tonyfraser
tonyfraser / read-only-s3-permission-subkey-policy.json
Last active June 21, 2019 16:01
A read-only S3 permissions policy. Think s3://outbounddrops/client-name, where you give
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"s3:ListAllMyBuckets",
"s3:GetBucketLocation"
],
"Effect": "Allow",
"Resource": [