Tony Fraser tonythor

## simple_conditional_pandas_column_using_lambda.py
# load random weblog data
# Columns kept from the raw weblog data.
columns = ['accept_language', 'domain', 'geo_city', 'geo_country','post_mobiledevice', 'post_mobileosversion']
# Select only `columns` from whatever s3.load returns, drop every row that has
# a missing value in any of them (how='any'), and take an independent copy.
# NOTE(review): `s3` is not defined in this snippet (presumably a project
# helper that reads the CSV files under `full_path` -- confirm), and the result
# is not bound to a name, so it is only visible when echoed by an interactive
# shell. Confirm whether an assignment is missing here.
s3.load(full_path='{bucket}/tfraser/{weblog}/{folder}/',
          file_type='csv',
          file_filter=".csv"
          )[columns].dropna(how='any').copy()

#  data looks like this.
#  accept_language     domain geo_city geo_country post_mobiledevice post_mobileosversion
#0           en-us     rr.com   austin         usa           iPad4,2           iOS 11.1.2

## seaborn_on_ipython.py
# This is for pip3 and an ipython installed via pip3; you should have these installed and be able to run them.
# thunder:~ user$ pip3 install seaborn ipython matplotlib

################### Using matplotlib  ################
# thunder:~ user$ ipython
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline # <- don't do this, your terminal can't render this. You need the popups.
# Read the Titanic CSV hosted on web.stanford.edu into a DataFrame (needs network access).
titanic = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

## simple_indexes.py
import pandas as pd
# randn lives in numpy.random; the top-level numpy package does not export it,
# so `from numpy import randn` fails with ImportError.
from numpy.random import randn

rows = ['a','b','c','d','e']
cols = ['w','x','y','z']
# 5x4 frame of standard-normal samples, labelled by `rows` (index) and
# `cols` (columns); keywords make the role of each label list explicit.
df = pd.DataFrame(randn(5,4), index=rows, columns=cols)

#          w         x         y         z
# a  2.706850  0.628133  0.907969  0.503826
# b  0.651118 -0.319318 -0.848077  0.605965

## list_map_lambda_filter_easy.py
# List Comprehension / Map / Lambda Functions Explained SUPER EASY

# Say you have a list of file names and want to work with their extensions.
files = ['tony.txt', 'fraser.csv', 'ex.xls']

# it could be a function, you could loop through it.
def get_suffix(file: str) -> str:
    """Return the extension of ``file``: the text after its last dot.

    Args:
        file: A file name such as ``'tony.txt'``.

    Returns:
        The extension without the leading dot, or ``''`` when the name
        contains no dot at all.
    """
    # rpartition splits on the LAST dot, so 'archive.tar.gz' yields 'gz'
    # (split('.')[1] gave 'tar'), and a dot-less name returns '' instead of
    # raising IndexError.
    _, dot, suffix = file.rpartition('.')
    return suffix if dot else ''
# for file in files: print(get_suffix(file))

## UseDariaToMakeExcelSafeCSV.scala
import com.github.mrpowers.spark.daria.sql.transformations
import scala.annotation.tailrec
// import other stuff related to spark

// Maps each special character (single quote, double quote, comma) to the
// backslash-prefixed text that should replace it.
val DefaultReplacements = Map(
    "'" -> "\\'",
    // NOTE(review): a double quote is replaced with an escaped *single* quote (\'), not \" -- confirm this is intended.
    "\"" -> "\\'",
    "," -> "\\,")

// if you wanted to pass in a list of columns, say all columns in a DF, you could replace like so.

## emptyToNullUdf.scala
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
// Usage: df.select(df.columns.map(c => emptyToNullUdf(col(c)).alias(c)): _*)
/** Converts a null or blank string into None.
  *
  * @param _str the value to inspect; may be null
  * @return None when `_str` is null or contains only whitespace,
  *         otherwise Some(_str) with the original (untrimmed) value
  */
def emptyToNull(_str: String): Option[String] =
  // Option(null) is None, so the null case needs no separate branch.
  Option(_str).filter(candidate => !candidate.trim.isEmpty)
// Wraps emptyToNull with Spark's udf(...) so it can be applied to DataFrame columns.
val emptyToNullUdf = udf(emptyToNull(_: String))

## ZeppelinService.scala
package com.gimmesome.zeppelin

import com.softwaremill.sttp._
import scala.util.parsing.json.JSON

// case class ZeppelinConfig (instance: String, baseUrl: String, authLoginUrl: String, authUid: String, authPass: String)

// Usage:
//  import something.ZeppelinService
//  val notebook = "2E6T7JZX1"

## zeppelin_test.sh
#!/bin/bash

# -> remember to run: dcos auth login first !!
# Token read from the local dcos CLI configuration; sent in the Authorization header below.
DCOS_API_TOKEN=$(dcos config show core.dcos_acs_token)
url="http://{marathon-domain}/service/{marathon zeppelin name}"
notebook="2E617JZX1" # $url/#/notebook/2E617JZX1
paragraph="20190916-164803_817623738"
#Note: to get paragraph ID, download notebook, open json and look for -> paragraphs -> Item [N] -> id.

# GET the notebook endpoint. The URL is quoted so that a value containing
# spaces (as the placeholder above does) is passed to curl as one argument
# instead of being word-split by the shell.
curl --request GET -s -H "Content-Type: application/json" -H "Authorization: token=$DCOS_API_TOKEN" "$url/api/notebook"

## recursivefunctionexec.scala
import scala.annotation.tailrec
import scala.concurrent.duration.Duration
import scala.util.Random

// the function we'll run until true
def myFunction(): Boolean = {
  val rand = Random.nextInt()
  if (rand % 10 == 0) {
    print(s"${rand} is divisible by 10\n")
    true

## Dockerfile
# Base image: Python 3.8.
FROM python:3.8

# Build-time arguments with their defaults; each can be overridden with
# `docker build --build-arg NAME=value`.
ARG AIRFLOW_VERSION=1.10.12
ARG AIRFLOW_USER_HOME=/usr/local/airflow
ARG AIRFLOW_DEPS=""
ARG PYTHON_DEPS=""
# Persist the chosen home directory into the image environment as AIRFLOW_HOME.
ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME}

# Copy requirements.txt from the build context to the image root.
COPY ./requirements.txt /requirements.txt
	# load random weblog data
	columns = ['accept_language', 'domain', 'geo_city', 'geo_country','post_mobiledevice', 'post_mobileosversion']
	s3.load(full_path='{bucket}/tfraser/{weblog}/{folder}/',
	file_type='csv',
	file_filter=".csv"
	)[columns].dropna(how='any').copy()

	# data looks like this.
	# accept_language domain geo_city geo_country post_mobiledevice post_mobileosversion
	#0 en-us rr.com austin usa iPad4,2 iOS 11.1.2
	# This is for pip3 and an ipython installed via pip3; you should have these installed and be able to run them.
	# thunder:~ user$ pip3 install seaborn ipython matplotlib

	################### Using matplotlib ################
	# thunder:~ user$ ipython
	import seaborn as sns
	import pandas as pd
	import matplotlib.pyplot as plt
	# %matplotlib inline # <- don't do this, your terminal can't render this. You need the popups.
	titanic = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')
	import pandas as pd
	# randn lives in numpy.random; the top-level numpy package does not export it,
	# so `from numpy import randn` fails with ImportError.
	from numpy.random import randn

	rows = ['a','b','c','d','e']
	cols = ['w','x','y','z']
	# 5x4 frame of standard-normal samples, labelled by `rows` (index) and
	# `cols` (columns); keywords make the role of each label list explicit.
	df = pd.DataFrame(randn(5,4), index=rows, columns=cols)

	# w x y z
	# a 2.706850 0.628133 0.907969 0.503826
	# b 0.651118 -0.319318 -0.848077 0.605965
	# List Comprehension / Map / Lambda Functions Explained SUPER EASY

	# say you have a list of files and want to work with the extensions.
	files = ['tony.txt', 'fraser.csv', 'ex.xls']

	# it could be a function, you could loop through it.
	def get_suffix(file: str) -> str:
	    """Return the extension of ``file``: the text after its last dot.

	    Args:
	        file: A file name such as ``'tony.txt'``.

	    Returns:
	        The extension without the leading dot, or ``''`` when the name
	        contains no dot at all.
	    """
	    # The body is indented under the def again (the flattened `return` was a
	    # syntax error). rpartition splits on the LAST dot, so 'archive.tar.gz'
	    # yields 'gz', and a dot-less name returns '' instead of raising IndexError.
	    _, dot, suffix = file.rpartition('.')
	    return suffix if dot else ''
	# for file in files: print(get_suffix(file))
	import com.github.mrpowers.spark.daria.sql.transformations
	import scala.annotation.tailrec
	// import other stuff related to spark

	val DefaultReplacements = Map(
	"'" -> "\\'",
	"\"" -> "\\'",
	"," -> "\\,")

	// if you wanted to pass in a list of columns, say all columns in a DF, you could replace like so.
	import org.apache.spark.sql.expressions.UserDefinedFunction
	import org.apache.spark.sql.functions.udf
	// Usage: df.select(df.columns.map(c => emptyToNullUdf(col(c)).alias(c)): _*)
	/** Converts a null or blank string into None.
	  *
	  * @param _str the value to inspect; may be null
	  * @return None when `_str` is null or contains only whitespace,
	  *         otherwise Some(_str) with the original (untrimmed) value
	  */
	def emptyToNull(_str: String): Option[String] =
	  // Replaces the `\|\|` residue (markdown-escaped pipes, not valid Scala):
	  // Option(null) is None, and the filter rejects blank strings.
	  Option(_str).filter(candidate => !candidate.trim.isEmpty)
	val emptyToNullUdf = udf(emptyToNull(_: String))
	package com.gimmesome.zeppelin

	import com.softwaremill.sttp._
	import scala.util.parsing.json.JSON

	// case class ZeppelinConfig (instance: String, baseUrl: String, authLoginUrl: String, authUid: String, authPass: String)

	// Usage:
	// import something.ZeppelinService
	// val notebook = "2E6T7JZX1"
	#!/bin/bash

	# -> remember to run: dcos auth login first !!
	# Token read from the local dcos CLI configuration; sent in the Authorization header below.
	DCOS_API_TOKEN=$(dcos config show core.dcos_acs_token)
	url="http://{marathon-domain}/service/{marathon zeppelin name}"
	notebook="2E617JZX1" # $url/#/notebook/2E617JZX1
	paragraph="20190916-164803_817623738"
	#Note: to get paragraph ID, download notebook, open json and look for -> paragraphs -> Item [N] -> id.

	# GET the notebook endpoint. The URL is quoted so that a value containing
	# spaces (as the placeholder above does) is passed to curl as one argument
	# instead of being word-split by the shell.
	curl --request GET -s -H "Content-Type: application/json" -H "Authorization: token=$DCOS_API_TOKEN" "$url/api/notebook"
	import scala.annotation.tailrec
	import scala.concurrent.duration.Duration
	import scala.util.Random

	// the function we'll run until true
	def myFunction(): Boolean = {
	val rand = Random.nextInt()
	if (rand % 10 == 0) {
	print(s"${rand} is divisible by 10\n")
	true
	FROM python:3.8

	ARG AIRFLOW_VERSION=1.10.12
	ARG AIRFLOW_USER_HOME=/usr/local/airflow
	ARG AIRFLOW_DEPS=""
	ARG PYTHON_DEPS=""
	ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME}

	COPY ./requirements.txt /requirements.txt