Daniel Chalef (danielchalef) / GitHub Gists
import scala.annotation.tailrec

object bisect {
  /**
   * Port of Python's bisect_right implemented using tail recursion.
   *
   * Return the index where to insert item x in Array a, assuming a is sorted.
   * The return value i is such that all e in a(0 until i) have e <= x, and
   * all e in a(i until a.length) have e > x. So if x already appears in the
   * array, inserting at i places it just after the rightmost x.
   */
  @tailrec
  def bisectRight(a: Array[Int], x: Int, lo: Int, hi: Int): Int =
    if (lo >= hi) lo
    else {
      val mid = (lo + hi) / 2
      if (x < a(mid)) bisectRight(a, x, lo, mid)
      else bisectRight(a, x, mid + 1, hi)
    }

  def bisectRight(a: Array[Int], x: Int): Int = bisectRight(a, x, 0, a.length)
}
danielchalef / spark_vector_col_to_scipy_sparse.py
Created December 24, 2019 04:06
Convert a Spark SparseVector column saved to parquet to a SciPy SparseMatrix
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix


def to_sparse(row: pd.Series, col_name: str) -> csr_matrix:
    """Parse one row of the flattened Spark SparseVector columns and return
    it as a 1 x size scipy.sparse.csr_matrix.
    """
    values = np.array(row[f"{col_name}.values"]).astype(np.float32)
    # All values belong to row 0 of the single-row output matrix.
    row_indices = np.zeros(values.shape[0], dtype=np.int8)
    col_indices = np.array(row[f"{col_name}.indices"])
    shape = (1, row[f"{col_name}.size"])
    return coo_matrix((values, (row_indices, col_indices)), shape=shape).tocsr()
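
A hedged usage sketch, not part of the original gist: it assumes the parquet file was read into pandas with the SparseVector struct flattened into "features.values", "features.indices", and "features.size" columns (the access pattern to_sparse expects); the path and the column name "features" are hypothetical.

import pandas as pd
from scipy.sparse import vstack

df = pd.read_parquet("spark_vectors.parquet")  # hypothetical path
# Stack the per-row 1 x size matrices into a single n x size CSR matrix.
mat = vstack([to_sparse(row, "features") for _, row in df.iterrows()], format="csr")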
danielchalef / airflow_dev_cleanup.sh
Created October 12, 2019 17:30
Clean up airflow dev environment and restart webserver/scheduler
#!/bin/bash
if [ -z "$AIRFLOW_HOME" ]
then
    echo "AIRFLOW_HOME is not set"
    exit 1
fi

# Stop a running webserver via its recorded pid, then remove the stale pid file.
if [ -f "$AIRFLOW_HOME/airflow-webserver.pid" ]
then
    kill "$(cat "$AIRFLOW_HOME/airflow-webserver.pid")" 2> /dev/null
    rm "$AIRFLOW_HOME/airflow-webserver.pid"
fi

# Restart the webserver and scheduler as daemons.
airflow webserver -D
airflow scheduler -D
danielchalef / wide_to_long.py
Last active March 8, 2022 10:55
PySpark equivalent to pandas.wide_to_long()
# PySpark equivalent to `pandas.wide_to_long()`
# Credit for melt(): https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe
#
# Usage: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.wide_to_long.html
# Note: this is a quick hack; some of the error checking in the original pandas version has been stripped out.
import re
from typing import Iterable, List
from pyspark.sql import DataFrame
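
A minimal sketch of the melt() helper credited in the comments above, following the linked Stack Overflow answer; the wide_to_long() wrapper that parses stub names with re is not reproduced here.

from typing import Iterable

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def melt(df: DataFrame, id_vars: Iterable[str], value_vars: Iterable[str],
         var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Unpivot df from wide to long format; a PySpark equivalent to pandas.melt()."""
    # Pair each value column's name with its value in a struct, one per column.
    vars_and_vals = F.array(*(
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_vars))
    # Explode to one output row per (id, variable, value) combination.
    tmp = df.withColumn("_vars_and_vals", F.explode(vars_and_vals))
    cols = list(id_vars) + [
        F.col("_vars_and_vals")[x].alias(x) for x in (var_name, value_name)]
    return tmp.select(*cols)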
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))

aws_profile = "default"  # the AWS profile to use
aws_region = "us-west-2"
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")
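
Not part of the original snippet: a hedged sketch of how credentials parsed this way are commonly handed to PySpark for s3a access. The config keys assume Hadoop's s3a connector; the app name and bucket path are hypothetical.

from pyspark.sql import SparkSession

# Assumption: pass the parsed keys to the s3a connector via Spark's Hadoop config.
spark = (
    SparkSession.builder
    .appName("s3a-credentials-example")  # hypothetical app name
    .config("spark.hadoop.fs.s3a.access.key", access_id)
    .config("spark.hadoop.fs.s3a.secret.key", access_key)
    .config("spark.hadoop.fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com")
    .getOrCreate()
)
df = spark.read.parquet("s3a://my-bucket/some/path/")  # hypothetical bucket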
import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.ProtobufRequestRowSerializer
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

// `corpus` is assumed to be a DataFrame with a Vector "features" column.
// SageMaker's recordIO-protobuf format expects a Double label column, so
// attach a monotonically increasing row id cast to Double.
val df_indexed = corpus.withColumn("id", monotonically_increasing_id().cast(DoubleType))
df_indexed.write.format("sagemaker")
  .option("labelColumnName", "id")
  .option("featuresColumnName", "features")
  .save("s3a://XXXXXXXXXXXXXXXXX")
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types._
import com.amazonaws.services.sagemaker.sparksdk.IAMRole
import com.amazonaws.services.sagemaker.sparksdk.SageMakerEstimator
import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.ProtobufResponseRowDeserializer
import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.ProtobufRequestRowSerializer

val iam_role = "arn:aws:iam::XXXXX"
val container = "174872318107.dkr.ecr.us-west-2.amazonaws.com/ntm:1"

// A sketch of the NTM estimator these imports set up; hyperparameter values
// and instance types below are illustrative, not from the original snippet.
val estimator = new SageMakerEstimator(
  trainingImage = container,
  modelImage = container,
  requestRowSerializer = new ProtobufRequestRowSerializer(),
  responseRowDeserializer = new ProtobufResponseRowDeserializer(
    schema = StructType(Array(StructField("topic_weights", VectorType)))),
  hyperParameters = Map("num_topics" -> "20", "feature_dim" -> "10000"),
  sagemakerRole = IAMRole(iam_role),
  trainingInstanceType = "ml.p2.xlarge",
  trainingInstanceCount = 1,
  endpointInstanceType = "ml.m4.xlarge",
  endpointInitialInstanceCount = 1,
  trainingSparkDataFormat = "sagemaker")
# nvidia-xconfig: X configuration file generated by nvidia-xconfig
# nvidia-xconfig: version 381.09 (buildmeister@swio-display-x86-rhel47-03) Thu Mar 30 18:21:34 PDT 2017
Section "ServerLayout"
Identifier "Layout0"
Screen 0 "Screen0"
Screen 1 "Screen1" RightOf "Screen0"
Screen 2 "Screen2" RightOf "Screen1"
Screen 3 "Screen3" RightOf "Screen2"
danielchalef / nv_tune.sh
Last active May 11, 2017 21:50
Nvidia GPU Tuner
#!/bin/bash
#
# For Nvidia GPUs only.
# Overclocks, underpowers (caps the power limit), and sets fan speeds.
#
# Requires bash, a compatible proprietary nvidia driver (tested with 375+), and a suitable xorg.conf.
# NOTE:
# - Ensure that your Xorg configuration has Coolbits set to allow manipulation
#   of the fans and clocks. Create a suitable xorg.conf using the following command:
#     sudo nvidia-xconfig -a --allow-empty-initial-configuration --cool-bits=28

# Representative tuning commands (illustrative values; adjust for your GPU):
sudo nvidia-smi -pm 1                                            # enable persistence mode
sudo nvidia-smi -i 0 -pl 150                                     # cap board power at 150 W
nvidia-settings -a "[gpu:0]/GPUGraphicsClockOffset[3]=100"       # core clock offset, MHz
nvidia-settings -a "[gpu:0]/GPUMemoryTransferRateOffset[3]=800"  # memory transfer rate offset, MHz
nvidia-settings -a "[gpu:0]/GPUFanControlState=1"                # take manual control of the fan
nvidia-settings -a "[fan:0]/GPUTargetFanSpeed=80"                # fan speed, percent