This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spell.client | |
client = spell.client.from_environment() | |
train = client.runs.new( | |
machine_type="cpu", | |
# IMPORTANT: point out that you can specify a branch as well. | |
github_url="https://github.com/ResidentMario/spell-batch.git", | |
pip_packages=["pandas", "scikit-learn"], | |
attached_resources={ | |
"s3://spell-datasets-share/wta-matches/": "/mnt/wta-matches/" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import argparse | |
from joblib import load | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--filename', type=str, dest='filename', help='path to the dataset to be scored') | |
args = parser.parse_args() | |
if __name__ == "__main__": | |
from distributed import Client, LocalCluster |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM ubuntu:18.04 | |
WORKDIR /spell | |
RUN apt-get update && \ | |
apt-get install -y wget git && rm -rf /var/lib/apt/lists/* | |
ENV CONDA_HOME=/root/anaconda/ | |
RUN wget \ | |
https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \ | |
&& mkdir /root/.conda \ | |
&& bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from joblib import dump

# Load the match records from the mounted dataset directory.
matches = pd.read_csv("/mnt/wta-matches/wta_matches_2015.csv")

# Ranking-point gap between winner and loser. Rows where the difference is
# missing are dropped here, so X and y below are built from the same rows.
point_diff = (matches.winner_rank_points - matches.loser_rank_points).dropna()

# Single-feature design matrix: one column holding the point difference.
X = point_diff.values[:, np.newaxis]

# Label is 1 when the winner held more ranking points than the loser, else 0,
# stored as a single-column 2-D array.
# NOTE(review): scikit-learn classifiers generally expect a 1-D target and
# warn on a column vector -- confirm how y is consumed further down.
y = (point_diff > 0).values.astype(int).reshape(-1, 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# ARGUMENT PARSING
#
import argparse

# Command-line options: dataset size plus how many workers/threads to launch.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--n-samples', type=int, dest='n_samples', default=5950,
    help='number of samples in the dataset',
)
parser.add_argument(
    '--n-workers', type=int, dest='n_workers', default=1,
    help='number of workers to launch',
)
parser.add_argument(
    '--threads-per-worker', type=int, dest='threads_per_worker', default=4,
    help='threads per worker',
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run the parallel training script with a single worker using one thread.
python scripts/training/gpu/train_parallel_implementation.py \
    --n-workers 1 --threads-per-worker 1 \
    --dataset-size '10 GB' --worker-memory-limit '12 GB' \
    --chunk-size 5950000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--------------------------------------------------------------------------- | |
IndexError Traceback (most recent call last) | |
<ipython-input-22-4c19f3a1c827> in <module> | |
----> 1 import dask.dataframe as dd; dd.read_parquet("s3://2019-nyc-taxi-trips/*/data.parquet", storage_options={"key": "AKIAVKTT2PHGKDQIERXC", "secret": "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"}, engine="fastparquet") | |
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/dataframe/io/parquet/core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, chunksize, **kwargs) | |
234 filters=filters, | |
235 split_row_groups=split_row_groups, | |
--> 236 **kwargs | |
237 ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FROM nvidia/cuda:10.1-base-ubuntu18.04 | |
WORKDIR /spell | |
RUN apt-get update && \ | |
apt-get install -y wget && rm -rf /var/lib/apt/lists/* | |
ENV CONDA_HOME=/root/anaconda/ | |
RUN wget \ | |
https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \ | |
&& mkdir /root/.conda \ | |
&& bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# https://gist.github.com/ResidentMario/9f41ac480f9efbf2ff1d05d450c29470
# -e: abort on the first failing command; -x: trace each command as it runs.
set -ex

# Both variables are required. Fail fast, with the message on stderr, when
# either is unset or empty. `exit 1` is a separate statement so the script
# stops even if the echo itself fails.
for required_var in EC2_SSH_KEY_NAME EC2_SSH_KEY_FILEPATH; do
    # ${!name:-} is indirect expansion: the value of the variable whose name
    # is stored in required_var, or the empty string when it is unset.
    if [[ -z "${!required_var:-}" ]]; then
        echo "${required_var} environment variable not set, exiting." >&2
        exit 1
    fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ID of the first cluster returned by `aws emr list-clusters`.
EMR_CLUSTER_ID=$(aws emr list-clusters \
    --output json \
    | jq -r '.Clusters[0].Id')

# ID of the instance group whose Name is "MASTER".
# The comparison must be `==`: in jq a single `=` is assignment, which yields
# the (modified) object, so select() would accept every instance group and
# [0] would simply be whichever group is listed first.
# NOTE(review): if the cluster's groups carry custom names, matching on
# .InstanceGroupType may be the more robust filter -- confirm against the
# cluster configuration.
EMR_INSTANCE_ID=$(aws emr describe-cluster \
    --output json \
    --cluster-id "$EMR_CLUSTER_ID" \
    | jq -r '.Cluster.InstanceGroups | map(select(.Name == "MASTER"))[0].Id')
EC2_INSTANCE_ID=$(aws emr list-instances \ |