Daniel Chalef (danielchalef)

@danielchalef
danielchalef / nv_tune.sh
Last active May 11, 2017 21:50
Nvidia GPU Tuner
#!/bin/bash
#
# For Nvidia GPUs only
# Overclocks, underpowers, and sets fan speeds
#
# Requires bash, a compatible proprietary nvidia driver (tested with 375+), and a suitable xorg.conf
# NOTE:
# - Ensure that your Xorg configuration has coolbits configured to allow manipulation of fan and clocks. Create
# a suitable xorg.conf using the following command:
# sudo nvidia-xconfig -a --allow-empty-initial-configuration --cool-bits=28
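#
# A sketch of the kinds of commands such a tuner issues (illustrative
# values only; the power limit, fan speed, and clock offsets below are
# assumptions, not the gist's actual settings):

# "underpowers": enable persistence mode and lower the board power limit (watts)
sudo nvidia-smi -pm 1
sudo nvidia-smi -pl 150

# take manual control of the fan and set a fixed speed (requires coolbits)
nvidia-settings -a "[gpu:0]/GPUFanControlState=1" \
                -a "[fan:0]/GPUTargetFanSpeed=80"

# overclock: apply core and memory clock offsets at performance level 3
nvidia-settings -a "[gpu:0]/GPUGraphicsClockOffset[3]=100" \
                -a "[gpu:0]/GPUMemoryTransferRateOffset[3]=600"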
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 381.09 (buildmeister@swio-display-x86-rhel47-03) Thu Mar 30 18:21:34 PDT 2017

Section "ServerLayout"
    Identifier     "Layout0"
    Screen      0  "Screen0"
    Screen      1  "Screen1" RightOf "Screen0"
    Screen      2  "Screen2" RightOf "Screen1"
    Screen      3  "Screen3" RightOf "Screen2"
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types._
import com.amazonaws.services.sagemaker.sparksdk.IAMRole
import com.amazonaws.services.sagemaker.sparksdk.SageMakerEstimator
import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.ProtobufResponseRowDeserializer
import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.ProtobufRequestRowSerializer
val iam_role = "arn:aws:iam::XXXXX"
val container = "174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1"
import org.apache.spark.sql.functions._

// corpus is assumed to be a DataFrame (built earlier) with a VectorType "features" column
val df_indexed = corpus.withColumn("id", monotonically_increasing_id().cast(DoubleType))
df_indexed.write.format("sagemaker")
  .option("labelColumnName", "id")
  .option("featuresColumnName", "features")
  .save("s3a://XXXXXXXXXXXXXXXXX")
import os
import configparser
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
aws_profile = 'default' # your AWS profile to use
aws_region = "us-west-2"
access_id = config.get(aws_profile, "aws_access_key_id")
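access_key = config.get(aws_profile, "aws_secret_access_key")

# A sketch of how the credentials might then be handed to Spark's s3a
# filesystem ("spark" is an assumed, already-created SparkSession):
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
hadoop_conf.set("fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com")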
@danielchalef
danielchalef / wide_to_long.py
Last active March 8, 2022 10:55
PySpark equivalent to pandas.wide_to_long()
# PySpark equivalent to `pandas.wide_to_long()`
# Credit for melt(): https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe
#
# Usage: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.wide_to_long.html
# Note: this was a quick hack, and some of the error checking present in the original Pandas version has been stripped out.
import re
from typing import Iterable, List
from pyspark.sql import DataFrame
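# A sketch of the melt() helper credited above (the common Stack Overflow
# version; wide_to_long() builds on it):
from pyspark.sql.functions import array, col, explode, lit, struct

def melt(df: DataFrame, id_vars: Iterable[str], value_vars: Iterable[str],
         var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert a DataFrame from wide to long format."""
    # one struct per value column, pairing the column name with its value
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    # explode into one row per (id, variable, value) triple
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    cols = list(id_vars) + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)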
@danielchalef
danielchalef / airflow_dev_cleanup.sh
Created October 12, 2019 17:30
Clean up airflow dev environment and restart webserver/scheduler
#!/bin/bash
if [ -z "$AIRFLOW_HOME" ]
then
    echo "AIRFLOW_HOME is not set"
    exit 1
fi
if [ -f "$AIRFLOW_HOME/airflow-webserver.pid" ]
then
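    # assumed continuation (the preview truncates here): stop the recorded
    # webserver process and remove the stale PID file before restarting
    kill "$(cat "$AIRFLOW_HOME/airflow-webserver.pid")"
    rm "$AIRFLOW_HOME/airflow-webserver.pid"
fi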
@danielchalef
danielchalef / spark_vector_col_to_scipy_sparse.py
Created December 24, 2019 04:06
Convert a Spark SparseVector column saved to parquet to a SciPy SparseMatrix
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix

def to_sparse(row: pd.Series, col_name: str) -> csr_matrix:
    """Parse one row of the constituent columns of a Spark SparseVector and
    return it as a scipy.sparse.csr_matrix.
    """
    values = np.array(row[f"{col_name}.values"]).astype(np.float32)
    row_indices = np.zeros(values.shape[0], dtype=np.int8)
    col_indices = np.array(row[f"{col_name}.indices"])
    shape = (1, row[f"{col_name}.size"])
    return coo_matrix((values, (row_indices, col_indices)), shape=shape).tocsr()
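# Example usage (the parquet path and "features" column name are
# illustrative): read the flattened SparseVector columns with pandas,
# convert row-by-row, and stack into a single CSR matrix.
from scipy.sparse import vstack

pdf = pd.read_parquet("vectors.parquet")
matrix = vstack(pdf.apply(to_sparse, axis=1, col_name="features").tolist())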
import scala.annotation.tailrec
object bisect {
  /**
   * Port of Python's bisect_right implemented using tail recursion.
   *
   * Return the index where to insert item x in Array A, assuming A is sorted.
   * The return value i is such that all e in a[:i] have e <= x, and all e in
   * a[i:] have e > x. So if x already appears in the array, inserting at i
   * places it just after the rightmost x already there.
   */
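  // A minimal tail-recursive implementation consistent with the docstring
  // above (a sketch; the gist body is truncated here, and the method name
  // bisectRight and Int element type are assumptions):
  def bisectRight(a: Array[Int], x: Int): Int = {
    @tailrec
    def loop(lo: Int, hi: Int): Int =
      if (lo >= hi) lo
      else {
        val mid = (lo + hi) >>> 1 // midpoint without Int overflow
        if (x < a(mid)) loop(lo, mid) else loop(mid + 1, hi)
      }
    loop(0, a.length)
  }
}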