Aleksey Bilogur (ResidentMario)
import spell.client

client = spell.client.from_environment()
train = client.runs.new(
    # the original preview is truncated here; runs.new also takes the
    # command for the run to execute
    machine_type="cpu",
    # a specific branch or commit of the repository can be specified as well
    github_url="https://github.com/ResidentMario/spell-batch.git",
    pip_packages=["pandas", "scikit-learn"],
    attached_resources={
        "s3://spell-datasets-share/wta-matches/": "/mnt/wta-matches/"
    },
)
import numpy as np
import argparse
from joblib import load

parser = argparse.ArgumentParser()
parser.add_argument('--filename', type=str, dest='filename', help='path to the dataset to be scored')
args = parser.parse_args()

if __name__ == "__main__":
    from distributed import Client, LocalCluster
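    # assumed continuation (the preview cuts off here): score the file named
    # by --filename on a local Dask cluster; the model path and output path
    # are placeholders, not from the original gist
    cluster = LocalCluster()
    client = Client(cluster)

    clf = load("model.joblib")  # placeholder path
    X = np.loadtxt(args.filename, delimiter=",", ndmin=2)

    # split the input and score the chunks in parallel across the workers
    chunks = np.array_split(X, max(len(cluster.workers), 1))
    y_pred = np.concatenate(client.gather(client.map(clf.predict, chunks)))
    np.savetxt("predictions.csv", y_pred, delimiter=",")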
FROM ubuntu:18.04
WORKDIR /spell
RUN apt-get update && \
    apt-get install -y wget git && rm -rf /var/lib/apt/lists/*
ENV CONDA_HOME=/root/anaconda/
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \
    && rm -f Miniconda3-py37_4.8.3-Linux-x86_64.sh
# assumed completion: the preview cuts off mid-RUN above; removing the
# installer and putting conda on the PATH are a guess at the remainder
ENV PATH=$CONDA_HOME/bin:$PATH
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from joblib import dump

matches = pd.read_csv("/mnt/wta-matches/wta_matches_2015.csv")
point_diff = (matches.winner_rank_points - matches.loser_rank_points).dropna()
X = point_diff.values[:, np.newaxis]
# scikit-learn expects a 1-D target, so drop the original reshape(-1, 1)
y = (point_diff > 0).values.astype(int)
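# assumed continuation (the preview cuts off here): fit the classifier on
# the rank-point differential and serialize it; the output path is a
# placeholder, not from the original gist
clf = LogisticRegression()
clf.fit(X, y)
dump(clf, "model.joblib")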
#
# ARGUMENT PARSING
#
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--n-samples', type=int, dest='n_samples', help='number of samples in the dataset', default=5950)
parser.add_argument('--n-workers', type=int, dest='n_workers', help='number of workers to launch', default=1)
parser.add_argument('--threads-per-worker', type=int, dest='threads_per_worker', help='threads per worker', default=4)
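# assumed continuation: parse the flags and use them to size a local Dask
# cluster (Client/LocalCluster are from dask's distributed package)
from distributed import Client, LocalCluster

args = parser.parse_args()
cluster = LocalCluster(
    n_workers=args.n_workers,
    threads_per_worker=args.threads_per_worker,
)
client = Client(cluster)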
python scripts/training/gpu/train_parallel_implementation.py \
--n-workers 1 --threads-per-worker 1 \
--dataset-size '10 GB' --worker-memory-limit '12 GB' \
--chunk-size 5950000
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-22-4c19f3a1c827> in <module>
----> 1 import dask.dataframe as dd; dd.read_parquet("s3://2019-nyc-taxi-trips/*/data.parquet", storage_options={"key": "<AWS_ACCESS_KEY_ID>", "secret": "<AWS_SECRET_ACCESS_KEY>"}, engine="fastparquet")
~/opt/miniconda3/envs/dask-local-test-env/lib/python3.7/site-packages/dask/dataframe/io/parquet/core.py in read_parquet(path, columns, filters, categories, index, storage_options, engine, gather_statistics, split_row_groups, chunksize, **kwargs)
234 filters=filters,
235 split_row_groups=split_row_groups,
--> 236 **kwargs
237 )
FROM nvidia/cuda:10.1-base-ubuntu18.04
WORKDIR /spell
RUN apt-get update && \
    apt-get install -y wget && rm -rf /var/lib/apt/lists/*
ENV CONDA_HOME=/root/anaconda/
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -fbp $CONDA_HOME \
    && rm -f Miniconda3-py37_4.8.3-Linux-x86_64.sh
# assumed completion: the preview cuts off mid-RUN above, as in the CPU
# image; removing the installer and the PATH update are a guess
ENV PATH=$CONDA_HOME/bin:$PATH
ResidentMario / build_gpu_image_on_ec2.sh
Last active February 13, 2024 13:55
EC2 GPU Image Builder Script
#!/bin/bash
# https://gist.github.com/ResidentMario/9f41ac480f9efbf2ff1d05d450c29470
set -ex
if [[ -z "$EC2_SSH_KEY_NAME" ]]; then
echo "EC2_SSH_KEY_NAME environment variable not set, exiting." && exit 1
fi
if [[ -z "$EC2_SSH_KEY_FILEPATH" ]]; then
echo "EC2_SSH_KEY_FILEPATH environment variable not set, exiting." && exit 1
fi
EMR_CLUSTER_ID=$(aws emr list-clusters \
--output json \
| jq -r '.Clusters[0].Id')
EMR_INSTANCE_ID=$(aws emr describe-cluster \
--output json \
--cluster-id $EMR_CLUSTER_ID | \
jq -r '.Cluster.InstanceGroups | map(select(.Name == "MASTER"))[0].Id')
# assumed completion: the original preview cuts off here; look up the
# master node's EC2 instance id ($EMR_INSTANCE_ID is its instance group)
EC2_INSTANCE_ID=$(aws emr list-instances \
    --output json \
    --cluster-id $EMR_CLUSTER_ID \
    --instance-group-id $EMR_INSTANCE_ID | \
    jq -r '.Instances[0].Ec2InstanceId')