Skip to content

Instantly share code, notes, and snippets.

@caleb-kaiser
caleb-kaiser / initialize.txt
Last active October 25, 2019 15:42
Initialize your environment
# Bootstrap the tutorial workspace: Git repo, sample project, virtualenv, DVC.
git init

# Download the pipeline archive from the dataset registry repository.
# `dvc get` takes the repository URL and the path inside that repository as
# two separate arguments, so each one sits on its own continuation line.
dvc get \
  https://github.com/iterative/dataset-registry \
  tutorial/nlp/pipeline.zip
unzip pipeline.zip
# pipeline.zip is a regular file; a trailing slash would make rm fail with
# "Not a directory".
rm -- pipeline.zip

# Create and activate a Python 3 virtualenv, then install the requirements
# listed in code/requirements.txt.
virtualenv -p python3 .env
source .env/bin/activate
pip3 install -r code/requirements.txt

# Append a blank line followed by ".env/" so Git ignores the virtualenv.
# printf is used because `echo -e` is not portable across shells.
printf '\n.env/\n' >> .gitignore

dvc init
@caleb-kaiser
caleb-kaiser / ExtractAndTransform.txt
Created October 25, 2019 15:48
Extract data and transform XML to TSV
# Stage file extract.dvc: depends on data/Posts.xml.zip, declares
# data/Posts.xml as its output, and runs the quoted unzip command.
dvc run -f extract.dvc \
  -d data/Posts.xml.zip \
  -o data/Posts.xml \
  'unzip data/Posts.xml.zip -d data'
# Stage file prepare.dvc: depends on code/xml_to_tsv.py and data/Posts.xml.
# NOTE(review): this `dvc run` invocation looks truncated — the last line ends
# with a line continuation and neither outputs nor a command to execute
# follow. Restore the rest from the original snippet before running it.
dvc run \
-f prepare.dvc \
-d code/xml_to_tsv.py \
-d data/Posts.xml \
@caleb-kaiser
caleb-kaiser / SplitAndFeaturize
Created October 25, 2019 15:52
Split into train/test sets and featurize
# Stage file split.dvc: depends on code/split_train_test.py and
# data/Posts.tsv, and declares data/Posts-train.tsv and data/Posts-test.tsv
# as outputs.
# NOTE(review): the command looks truncated — the final line ends with a line
# continuation, so any arguments after 0.2 are missing. Restore them from the
# original snippet before running it.
dvc run \
-f split.dvc \
-d code/split_train_test.py \
-d data/Posts.tsv \
-o data/Posts-train.tsv \
-o data/Posts-test.tsv \
python \
code/split_train_test.py \
data/Posts.tsv \
0.2 \
@caleb-kaiser
caleb-kaiser / TrainAndEvaluate.txt
Created October 25, 2019 15:53
Train model and evaluate
# Stage file train.dvc: depends on code/train_model.py and
# data/matrix-train.pkl, declares data/model.pkl as its output, and runs
# code/train_model.py with the arguments shown on the last line.
dvc run -f train.dvc \
  -d code/train_model.py -d data/matrix-train.pkl \
  -o data/model.pkl \
  python code/train_model.py data/matrix-train.pkl 20191001 data/model.pkl
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import sys
import re
import boto3
# AWS credentials and S3 upload destination. The credentials are empty strings
# and the path is a placeholder here; fill them in before use.
# The `#@param` trailers look like notebook form annotations (presumably
# Colab) — TODO confirm before editing or removing them.
# NOTE(review): do not commit real credentials to this file.
AWS_ACCESS_KEY_ID = "" #@param {type:"string"}
AWS_SECRET_ACCESS_KEY = "" #@param {type:"string"}
S3_UPLOAD_PATH = "s3://your/bucket" #@param {type:"string"}
- kind: deployment
  name: text
- kind: api
  name: classify
  model: s3://path/to/model.onnx
md5: 4fb200acee6d3baf4a5973991a1c1c28
cmd: python code/evaluate.py data/model.pkl data/matrix-test.pkl auc.metric
deps:
- md5: 6570db179732195bce53950e47ef0d83
  path: code/evaluate.py
- md5: b4f44c9a41d9ddeb977bc00ae32796be
  path: data/model.pkl
- md5: 8d631caa6317bc980d5b30f1724ec639
  path: data/matrix-test.pkl
outs:
md5: 4fb200acee6d3baf4a5973991a1c1c28
cmd: python code/evaluate.py data/model.pkl data/matrix-test.pkl auc.metric
deps:
- md5: 6570db179732195bce53950e47ef0d83
  path: code/evaluate.py
- md5: b4f44c9a41d9ddeb977bc00ae32796be
  path: data/model.pkl
- md5: 8d631caa6317bc980d5b30f1724ec639
  path: data/matrix-test.pkl
outs:
md5: 5162da3999c35883752005b43ef920ac
cmd: python code/train_model.py data/matrix-train.pkl 20191001 data/model.pkl
deps:
- md5: 5c47c8bc5ee1ff8a5a81f654a1d7a167
  path: code/train_model.py
- md5: 5bd2d39b9719a77107f5172d68bef5c2
  path: data/matrix-train.pkl
outs:
- md5: b4f44c9a41d9ddeb977bc00ae32796be
  path: data/model.pkl
from sklearn.ensemble import RandomForestClassifier
import sys
import re
import boto3
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
try:
import cPickle as pickle # python2
except ModuleNotFoundError: