Tony Fraser (tonythor)

@tonythor
tonythor / CsvStringToSparkDF.Scala
Created June 14, 2023 21:51
A Scala snippet that takes CSV string data and returns a Spark DataFrame. It was designed for unit testing simple DataFrame transform methods.
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
object TestDataFrameBuilder {
// **************************
// val names =
// """tony,schmaser
// |fred,smith
// |reed,jerry""".stripMargin
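The preview above is cut off by the listing; as a hedged PySpark sketch of the same idea (not the gist's actual Scala, and the helper name and schema are illustrative), a multi-line CSV string can be split into rows and turned into a DataFrame for unit tests:
# Hedged PySpark sketch, not the gist's code. Column names and the helper are illustrative.
from pyspark.sql import SparkSession

def csv_string_to_df(spark, csv_string, columns):
    # Split the raw CSV block into rows, then each row into fields.
    rows = [tuple(line.split(",")) for line in csv_string.strip().splitlines()]
    return spark.createDataFrame(rows, columns)

# Example usage in a unit test:
# spark = SparkSession.builder.master("local[1]").getOrCreate()
# names = "tony,schmaser\nfred,smith\nreed,jerry"
# df = csv_string_to_df(spark, names, ["first", "last"])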
@tonythor
tonythor / todcr.sh
Created June 1, 2023 14:02
AWS EMR with targeted On-Demand Capacity Reservations
#!/bin/bash
# EMR in targeted ODCRs (targeted On-Demand Capacity Reservations)
# THIS IS NOT A RUNNABLE SCRIPT -> IT WAS DESIGNED FOR CUTTING AND PASTING DURING A DEMO
# Do this: https://docs.aws.amazon.com/emr/latest/ManagementGuide/on-demand-capacity-reservations.html
# YouTube demo: https://www.youtube.com/watch?v=WYWSFb5wZuo
mnode="r4.xlarge" # master node
@tonythor
tonythor / basic-python-stats.py
Last active April 12, 2023 02:52
Five-number summary, stddev, and basic stats with Python
f = [66,67,67,68,68,68,68,69,69,69,69,70,70,71,71,72,73,75]
from numpy import percentile
def five_number_summary(data):
quartiles = percentile(data, [25, 50, 75])
print(data)
print('Min: %.3f' % min(data))
print('Q1: %.3f' % quartiles[0])
print('Median: %.3f' % quartiles[1])
print('Q3: %.3f' % quartiles[2])
print('Max: %.3f' % max(data))
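The listing truncates the gist here; the standard deviation mentioned in the description can be computed the same way with numpy. This is a hedged addition, not the gist's remaining lines:
# Hedged addition (not the gist's hidden lines): the stddev piece of "basic stats".
from numpy import mean, std

five_number_summary(f)
print('Mean: %.3f' % mean(f))
print('StdDev: %.3f' % std(f, ddof=1))  # ddof=1 -> sample standard deviation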
@tonythor
tonythor / pandas_strings_to_dates.py
Created March 8, 2023 19:30
pandas -- strings to dates
import pandas as pd
df = pd.read_csv('./nogit_dataset.csv')
df.head(5)
# StringDate Product Store Value
#0 1012018 2667437 QLD_CW_ST0203 2926.000
#1 2012018 2667437 QLD_CW_ST0203 2687.531
#2 3012018 2667437 QLD_CW_ST0203 2793.000
#3 4012018 2667437 QLD_CW_ST0203 2394.000
#4 5012018 2667437 QLD_CW_ST0203 2660.000
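The conversion itself sits below the preview cutoff. Assuming StringDate is day-month-year packed into one integer with no zero padding (so 1012018 is 1 Jan 2018), a hedged sketch of the conversion looks like:
# Hedged sketch, not the gist's actual code. Assumes StringDate packs D/M/YYYY into one
# integer with no zero padding, e.g. 1012018 -> 2018-01-01.
df['Date'] = pd.to_datetime(df['StringDate'].astype(str).str.zfill(8), format='%d%m%Y')
df = df.drop(columns=['StringDate'])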
@tonythor
tonythor / aws-s3-ls-output-to-df.scala
Last active December 21, 2022 17:31
aws s3 ls output loaded into a DataFrame.
// If you have a bunch of `aws s3 ls > $date.txt` files in a directory,
// you can load them into a DataFrame to look at them. Of course you can use the Hadoop API instead,
// but this is quick and dirty, and it works if you're trying to troubleshoot whether a feed is working.
import scala.util.Try
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.functions.{col, lit, udf, input_file_name, unix_timestamp, date_format}
def col_builder(d: String, p:Int, l:String = " +"):String = Try {
val myArray = d.split(l).toSeq
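As a lighter-weight alternative to the Spark version above, here is a hedged pandas sketch, not the gist's code, for the same troubleshooting task; the directory path and column names are placeholders:
# Hedged pandas sketch, not the gist's Scala: load a directory of `aws s3 ls > $date.txt`
# captures into one DataFrame. Paths and column names are placeholders.
import glob
import pandas as pd

frames = []
for path in glob.glob("./s3_listings/*.txt"):
    # A typical `aws s3 ls` line: 2022-12-21 17:31:00   12345 some/key.parquet
    listing = pd.read_csv(path, sep=r"\s+", engine="python", header=None,
                          names=["date", "time", "size", "key"])
    listing["source_file"] = path          # which capture file the row came from
    frames.append(listing)

all_listings = pd.concat(frames, ignore_index=True)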
@tonythor
tonythor / upload_s3_retention_policy.py
Last active November 4, 2022 21:22
Use Python and the S3 API to upload a retention policy to an S3 bucket
import boto3
import pprint
## To be used if you're starting up a long-running job that will constantly write
## to S3 and you want objects deleted after n days no matter what. Think
## clearing out logs, deleting old versions of data sets, etc.
# set your variables
rule_id_string='deleteCloudTrailAfter30Days'
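The put call itself is below the preview cutoff; a hedged sketch of how such a lifecycle/retention rule is typically uploaded with boto3 (the bucket name is a placeholder, and the rule reuses the variables defined above):
# Hedged sketch (not the gist's hidden lines): attach a 30-day expiration lifecycle rule.
# "my-cloudtrail-bucket" is a placeholder bucket name.
s3 = boto3.client("s3")
s3.put_bucket_lifecycle_configuration(
    Bucket="my-cloudtrail-bucket",
    LifecycleConfiguration={
        "Rules": [{
            "ID": rule_id_string,
            "Filter": {"Prefix": ""},      # apply to the whole bucket
            "Status": "Enabled",
            "Expiration": {"Days": 30},    # delete objects after 30 days
        }]
    },
)
pprint.pprint(s3.get_bucket_lifecycle_configuration(Bucket="my-cloudtrail-bucket"))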
@tonythor
tonythor / airflow-xcom-conditional-logic-dag.py
Created August 17, 2022 21:11
An example Airflow DAG that uses Jinja2 conditional logic with both dag_run and XComs
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator
from jinja2 import Template, Environment, FileSystemLoader
dag_id='nogit-arnon-exceptions'
docs = """
Trigger with:
{
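The trigger payload and the tasks are below the preview cutoff. As a hedged illustration, not the gist's actual tasks, of templating against dag_run.conf and XComs (the dag id, task ids, and keys here are placeholders):
# Hedged illustration, not the gist's code: templates_dict on PythonOperator is a
# Jinja-rendered field, so dag_run.conf and xcom_pull can both be referenced from it.
def report(templates_dict=None, **_):
    print(templates_dict["env"], templates_dict["upstream_value"])

with DAG(dag_id="nogit-templating-sketch", start_date=days_ago(1), schedule_interval=None) as sketch_dag:
    report_task = PythonOperator(
        task_id="report",
        python_callable=report,
        provide_context=True,  # needed on older (1.10-style) Airflow, matching the import path above
        templates_dict={
            "env": "{{ dag_run.conf.get('env', 'dev') if dag_run and dag_run.conf else 'dev' }}",
            "upstream_value": "{{ ti.xcom_pull(task_ids='extract') }}",  # 'extract' is a hypothetical upstream task
        },
    )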
@tonythor
tonythor / Dockerfile-pyspark-python39-boto-elastic-container-service
Last active August 3, 2022 19:03
Demo: Python 3.9, PySpark 3.3.0, and boto3, all running off ECS session variables.
FROM python:3.9
WORKDIR /usr/src/app
ENV SPARK_HOME=/usr/local/lib/python3.9/site-packages/pyspark
RUN mkdir -p ~/.aws
RUN mkdir -p "${SPARK_HOME}/jars"
RUN /usr/local/bin/python -m pip install --upgrade pip
RUN apt-get update && apt-get install -y curl awscli vim libsnappy-dev openjdk-11-jdk mlocate
COPY ./requirements.txt requirements.txt
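A hedged smoke-test sketch, not part of the gist, that could be run inside the container to confirm boto3 resolves the ECS task-role credentials and that PySpark starts:
# Hedged smoke test, not from the gist: run inside the container to confirm that
# boto3 picks up credentials from the ECS task role and that a local SparkSession starts.
import boto3
from pyspark.sql import SparkSession

print(boto3.client("sts").get_caller_identity()["Arn"])    # identity of the task role

spark = SparkSession.builder.master("local[1]").appName("smoke").getOrCreate()
print(spark.version)                                       # expected to report 3.3.0
spark.stop()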
@tonythor
tonythor / delete_versions.py
Created June 30, 2022 15:24
Delete files from within a versioned S3 bucket
# Say you are trying to delete a versioned S3 bucket. You run all your
# S3 commands and the bucket looks empty, but it's not: all of the
# hidden/previous versions are still up there, and non-empty buckets can't be deleted.
#
# You click "show versions" in the console, and wow, it's a ton of stuff.
# S3 deletes one file at a time, and there's more than 24 hours' worth of
# individual delete commands needed to remove all those versions. But you try anyway.
# And then your MFA token times out within 24 hours and all the deletes are rolled back.
# Terminal probably isn't on MFA, so you're probably going to have to fire off
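The loop itself is below the cutoff; a hedged sketch, not the gist's code, of the usual boto3 approach to purging every object version and delete marker (the bucket name is a placeholder):
# Hedged sketch, not the gist's hidden lines: purge every object version and delete
# marker so the bucket can actually be deleted. "my-versioned-bucket" is a placeholder.
import boto3

bucket = boto3.resource("s3").Bucket("my-versioned-bucket")
bucket.object_versions.delete()   # batches DeleteObjects calls across all versions
# bucket.delete()                 # once it is truly empty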
@tonythor
tonythor / switch.sh
Last active March 8, 2022 20:58
A bash script to swap AWS credential files, in case you don't want to use profiles.
#!/bin/bash
# This assumes you want to use symlinks for ~/.aws/config and ~/.aws/credentials.
# Symlinks work just fine with IDEs, AWS clients, etc. In my experience this is simpler than
# always using profiles, plus you get to name/isolate credentials files.
usage() {
echo "$0 -w (to work) -p (to personal)"
exit 1
}
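For completeness, a hedged Python equivalent of the symlink swap; the file-naming convention below is an assumption, not taken from the gist:
# Hedged Python equivalent, not the gist's script. Assumes credential sets are kept as
# ~/.aws/config.work / credentials.work and config.personal / credentials.personal.
from pathlib import Path
import sys

def switch(profile: str) -> None:
    aws = Path.home() / ".aws"
    for name in ("config", "credentials"):
        link, target = aws / name, aws / f"{name}.{profile}"
        if link.is_symlink() or link.exists():
            link.unlink()
        link.symlink_to(target)   # IDEs and AWS clients follow the symlink transparently

if __name__ == "__main__":
    switch({"-w": "work", "-p": "personal"}[sys.argv[1]])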