Aleksey Bilogur (ResidentMario)
import spell.client

client = spell.client.from_environment()
train = client.runs.new(
    # the original preview is truncated here; runs.new also takes the
    # command for the run to execute
    machine_type="cpu",
    # a specific branch or commit of the repository can be specified as well
    github_url="https://github.com/ResidentMario/spell-batch.git",
    pip_packages=["pandas", "scikit-learn"],
    attached_resources={
        "s3://spell-datasets-share/wta-matches/": "/mnt/wta-matches/"
    },
)
import numpy as np
import argparse
from joblib import load

parser = argparse.ArgumentParser()
parser.add_argument('--filename', type=str, dest='filename', help='path to the dataset to be scored')
args = parser.parse_args()

if __name__ == "__main__":
    from distributed import Client, LocalCluster
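    # assumed continuation (the preview cuts off here): score the file named
    # by --filename on a local Dask cluster; the model path and output path
    # are placeholders, not from the original gist
    cluster = LocalCluster()
    client = Client(cluster)

    clf = load("model.joblib")  # placeholder path
    X = np.loadtxt(args.filename, delimiter=",", ndmin=2)

    # split the input and score the chunks in parallel across the workers
    chunks = np.array_split(X, max(len(cluster.workers), 1))
    y_pred = np.concatenate(client.gather(client.map(clf.predict, chunks)))
    np.savetxt("predictions.csv", y_pred, delimiter=",")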
FROM ubuntu:18.04
WORKDIR /spell
RUN apt-get update && \
    apt-get install -y wget git && rm -rf /var/lib/apt/lists/*
ENV CONDA_HOME=/root/anaconda/
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \
    && rm -f Miniconda3-py37_4.8.3-Linux-x86_64.sh
# assumed completion: the preview cuts off mid-RUN above; removing the
# installer and putting conda on the PATH are a guess at the remainder
ENV PATH=$CONDA_HOME/bin:$PATH
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from joblib import dump

matches = pd.read_csv("/mnt/wta-matches/wta_matches_2015.csv")
point_diff = (matches.winner_rank_points - matches.loser_rank_points).dropna()
X = point_diff.values[:, np.newaxis]
# scikit-learn expects a 1-D target, so drop the original reshape(-1, 1)
y = (point_diff > 0).values.astype(int)
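# assumed continuation (the preview cuts off here): fit the classifier on
# the rank-point differential and serialize it; the output path is a
# placeholder, not from the original gist
clf = LogisticRegression()
clf.fit(X, y)
dump(clf, "model.joblib")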
#
# ARGUMENT PARSING
#
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--n-samples', type=int, dest='n_samples', help='number of samples in the dataset', default=5950)
parser.add_argument('--n-workers', type=int, dest='n_workers', help='number of workers to launch', default=1)
parser.add_argument('--threads-per-worker', type=int, dest='threads_per_worker', help='threads per worker', default=4)
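# assumed continuation: parse the flags and use them to size a local Dask
# cluster (Client/LocalCluster are from dask's distributed package)
from distributed import Client, LocalCluster

args = parser.parse_args()
cluster = LocalCluster(
    n_workers=args.n_workers,
    threads_per_worker=args.threads_per_worker,
)
client = Client(cluster)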
python scripts/training/gpu/train_parallel_implementation.py \
--n-workers 1 --threads-per-worker 1 \
--dataset-size '10 GB' --worker-memory-limit '12 GB' \
--chunk-size 5950000
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-22-4c19f3a1c827> in <module>
----> 1 import dask.dataframe as dd; dd.read_parquet("s3://2019-nyc-taxi-trips/*/data.parquet", storage_options={"key": "<AWS_ACCESS_KEY_ID>", "secret": "<AWS_SECRET_ACCESS_KEY>"}, engine="fastparquet")
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/dataframe/io/parquet/core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, chunksize, **kwargs)
234 filters=filters,
235 split_row_groups=split_row_groups,
--> 236 **kwargs
237 )
FROM nvidia/cuda:10.1-base-ubuntu18.04
WORKDIR /spell
RUN apt-get update && \
    apt-get install -y wget && rm -rf /var/lib/apt/lists/*
ENV CONDA_HOME=/root/anaconda/
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \
    && rm -f Miniconda3-py37_4.8.3-Linux-x86_64.sh
# assumed completion: the preview cuts off mid-RUN above, as in the CPU
# image; removing the installer and the PATH update are a guess
ENV PATH=$CONDA_HOME/bin:$PATH
ResidentMario / build_gpu_image_on_ec2.sh
Last active February 13, 2024 13:55
EC2 GPU Image Builder Script
#!/bin/bash
# https://gist.github.com/ResidentMario/9f41ac480f9efbf2ff1d05d450c29470
set -ex
if [[ -z "$EC2_SSH_KEY_NAME" ]]; then
echo "EC2_SSH_KEY_NAME environment variable not set, exiting." && exit 1
fi
if [[ -z "$EC2_SSH_KEY_FILEPATH" ]]; then
echo "EC2_SSH_KEY_FILEPATH environment variable not set, exiting." && exit 1
fi
EMR_CLUSTER_ID=$(aws emr list-clusters \
--output json \
| jq -r '.Clusters[0].Id')
EMR_INSTANCE_ID=$(aws emr describe-cluster \
--output json \
--cluster-id $EMR_CLUSTER_ID | \
jq -r '.Cluster.InstanceGroups | map(select(.Name == "MASTER"))[0].Id')
# assumed completion: the original preview cuts off here; look up the
# master node's EC2 instance id ($EMR_INSTANCE_ID is its instance group)
EC2_INSTANCE_ID=$(aws emr list-instances \
    --output json \
    --cluster-id $EMR_CLUSTER_ID \
    --instance-group-id $EMR_INSTANCE_ID | \
    jq -r '.Instances[0].Ec2InstanceId')