Daniel Chalef (danielchalef) / GitHub Gists
import scala.annotation.tailrec

object bisect {
  /**
   * Port of Python's bisect_right implemented using tail recursion.
   *
   * Return the index where to insert item x in Array a, assuming a is sorted.
   * The return value i is such that all e in a(0 until i) have e <= x, and
   * all e in a(i until a.length) have e > x. So if x already appears in the
   * array, inserting at i places it just after the rightmost x.
   */
  @tailrec
  def bisectRight(a: Array[Int], x: Int, lo: Int, hi: Int): Int =
    if (lo >= hi) lo
    else {
      val mid = (lo + hi) / 2
      if (x < a(mid)) bisectRight(a, x, lo, mid)
      else bisectRight(a, x, mid + 1, hi)
    }

  def bisectRight(a: Array[Int], x: Int): Int = bisectRight(a, x, 0, a.length)
}
danielchalef / spark_vector_col_to_scipy_sparse.py
Created December 24, 2019 04:06
Convert a Spark SparseVector column saved to parquet to a SciPy SparseMatrix
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix


def to_sparse(row: pd.Series, col_name: str) -> csr_matrix:
    """Parse one row of the flattened Spark SparseVector columns and return
    it as a 1 x size scipy.sparse.csr_matrix.
    """
    values = np.array(row[f"{col_name}.values"]).astype(np.float32)
    # All values belong to row 0 of the single-row output matrix.
    row_indices = np.zeros(values.shape[0], dtype=np.int8)
    col_indices = np.array(row[f"{col_name}.indices"])
    shape = (1, row[f"{col_name}.size"])
    return coo_matrix((values, (row_indices, col_indices)), shape=shape).tocsr()
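
A hedged usage sketch, not part of the original gist: it assumes the parquet file was read into pandas with the SparseVector struct flattened into "features.values", "features.indices", and "features.size" columns (the access pattern to_sparse expects); the path and the column name "features" are hypothetical.

import pandas as pd
from scipy.sparse import vstack

df = pd.read_parquet("spark_vectors.parquet")  # hypothetical path
# Stack the per-row 1 x size matrices into a single n x size CSR matrix.
mat = vstack([to_sparse(row, "features") for _, row in df.iterrows()], format="csr")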
danielchalef / airflow_dev_cleanup.sh
Created October 12, 2019 17:30
Clean up airflow dev environment and restart webserver/scheduler
#!/bin/bash
if [ -z "$AIRFLOW_HOME" ]
then
    echo "AIRFLOW_HOME is not set"
    exit 1
fi

# Stop a running webserver via its recorded pid, then remove the stale pid file.
if [ -f "$AIRFLOW_HOME/airflow-webserver.pid" ]
then
    kill "$(cat "$AIRFLOW_HOME/airflow-webserver.pid")" 2> /dev/null
    rm "$AIRFLOW_HOME/airflow-webserver.pid"
fi

# Restart the webserver and scheduler as daemons.
airflow webserver -D
airflow scheduler -D
danielchalef / wide_to_long.py
Last active March 8, 2022 10:55
PySpark equivalent to pandas.wide_to_long()
# PySpark equivalent to `pandas.wide_to_long()`
# Credit for melt(): https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe
#
# Usage: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.wide_to_long.html
# Note: this is a quick hack; some of the error checking in the original pandas version has been stripped out.
import re
from typing import Iterable, List
from pyspark.sql import DataFrame
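
A minimal sketch of the melt() helper credited in the comments above, following the linked Stack Overflow answer; the wide_to_long() wrapper that parses stub names with re is not reproduced here.

from typing import Iterable

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def melt(df: DataFrame, id_vars: Iterable[str], value_vars: Iterable[str],
         var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Unpivot df from wide to long format; a PySpark equivalent to pandas.melt()."""
    # Pair each value column's name with its value in a struct, one per column.
    vars_and_vals = F.array(*(
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_vars))
    # Explode to one output row per (id, variable, value) combination.
    tmp = df.withColumn("_vars_and_vals", F.explode(vars_and_vals))
    cols = list(id_vars) + [
        F.col("_vars_and_vals")[x].alias(x) for x in (var_name, value_name)]
    return tmp.select(*cols)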
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))

aws_profile = "default"  # the AWS profile to use
aws_region = "us-west-2"
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")
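
Not part of the original snippet: a hedged sketch of how credentials parsed this way are commonly handed to PySpark for s3a access. The config keys assume Hadoop's s3a connector; the app name and bucket path are hypothetical.

from pyspark.sql import SparkSession

# Assumption: pass the parsed keys to the s3a connector via Spark's Hadoop config.
spark = (
    SparkSession.builder
    .appName("s3a-credentials-example")  # hypothetical app name
    .config("spark.hadoop.fs.s3a.access.key", access_id)
    .config("spark.hadoop.fs.s3a.secret.key", access_key)
    .config("spark.hadoop.fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com")
    .getOrCreate()
)
df = spark.read.parquet("s3a://my-bucket/some/path/")  # hypothetical bucket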
import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.ProtobufRequestRowSerializer
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

// `corpus` is assumed to be a DataFrame with a Vector "features" column.
// SageMaker's recordIO-protobuf format expects a Double label column, so
// attach a monotonically increasing row id cast to Double.
val df_indexed = corpus.withColumn("id", monotonically_increasing_id().cast(DoubleType))
df_indexed.write.format("sagemaker")
  .option("labelColumnName", "id")
  .option("featuresColumnName", "features")
  .save("s3a://XXXXXXXXXXXXXXXXX")
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types._
import com.amazonaws.services.sagemaker.sparksdk.IAMRole
import com.amazonaws.services.sagemaker.sparksdk.SageMakerEstimator
import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.ProtobufResponseRowDeserializer
import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.ProtobufRequestRowSerializer

val iam_role = "arn:aws:iam::XXXXX"
val container = "174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1"

// A sketch of the NTM estimator these imports set up; hyperparameter values
// and instance types below are illustrative, not from the original snippet.
val estimator = new SageMakerEstimator(
  trainingImage = container,
  modelImage = container,
  requestRowSerializer = new ProtobufRequestRowSerializer(),
  responseRowDeserializer = new ProtobufResponseRowDeserializer(
    schema = StructType(Array(StructField("topic_weights", VectorType)))),
  hyperParameters = Map("num_topics" -> "20", "feature_dim" -> "10000"),
  sagemakerRole = IAMRole(iam_role),
  trainingInstanceType = "ml.p2.xlarge",
  trainingInstanceCount = 1,
  endpointInstanceType = "ml.m4.xlarge",
  endpointInitialInstanceCount = 1,
  trainingSparkDataFormat = "sagemaker")
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 381.09 (buildmeister@swio-display-x86-rhel47-03) Thu Mar 30 18:21:34 PDT 2017
Section "ServerLayout"
Identifier "Layout0"
Screen 0 "Screen0"
Screen 1 "Screen1" RightOf "Screen0"
Screen 2 "Screen2" RightOf "Screen1"
Screen 3 "Screen3" RightOf "Screen2"
danielchalef / nv_tune.sh
Last active May 11, 2017 21:50
Nvidia GPU Tuner
#!/bin/bash
#
# For Nvidia GPUs only.
# Overclocks, underpowers (caps the power limit), and sets fan speeds.
#
# Requires bash, a compatible proprietary nvidia driver (tested with 375+), and a suitable xorg.conf.
# NOTE:
# - Ensure that your Xorg configuration has Coolbits set to allow manipulation
#   of the fans and clocks. Create a suitable xorg.conf using the following command:
#     sudo nvidia-xconfig -a --allow-empty-initial-configuration --cool-bits=28

# Representative tuning commands (illustrative values; adjust for your GPU):
sudo nvidia-smi -pm 1                                            # enable persistence mode
sudo nvidia-smi -i 0 -pl 150                                     # cap board power at 150 W
nvidia-settings -a "[gpu:0]/GPUGraphicsClockOffset[3]=100"       # core clock offset, MHz
nvidia-settings -a "[gpu:0]/GPUMemoryTransferRateOffset[3]=800"  # memory transfer rate offset, MHz
nvidia-settings -a "[gpu:0]/GPUFanControlState=1"                # take manual control of the fan
nvidia-settings -a "[fan:0]/GPUTargetFanSpeed=80"                # fan speed, percent