Skip to content

Instantly share code, notes, and snippets.

@nsakki55
Created May 4, 2022 01:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nsakki55/d789542d42a6b980f290fcfcf1d38b97 to your computer and use it in GitHub Desktop.
Save nsakki55/d789542d42a6b980f290fcfcf1d38b97 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "16f0764c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/satsuki/.pyenv/versions/3.7.10/envs/data-science/lib/python3.7/site-packages/pandas/compat/__init__.py:124: UserWarning: Could not import the lzma module. Your installed Python is incomplete. Attempting to use lzma compression will result in a RuntimeError.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"import pandas as pd\n",
"import yaml\n",
"import boto3\n",
"import sagemaker\n",
"from sagemaker.sklearn.processing import SKLearnProcessor\n",
"from sagemaker.processing import ProcessingInput, ProcessingOutput"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "60a27945",
"metadata": {},
"outputs": [],
"source": [
"# Read the SageMaker execution role and S3 bucket name from the project settings file.\n",
"SETTING_FILE_PATH = '../settings.yaml'\n",
"\n",
"with open(SETTING_FILE_PATH) as file:\n",
"    aws_info = yaml.safe_load(file)\n",
"\n",
"# Both values live under the aws -> sagemaker section of the settings.\n",
"sagemaker_settings = aws_info['aws']['sagemaker']\n",
"role = sagemaker_settings['role']\n",
"s3bucket = sagemaker_settings['s3bucket']\n",
"\n",
"# Low-level SageMaker client, used later to list processing jobs.\n",
"sm = boto3.client('sagemaker')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c1047238",
"metadata": {},
"outputs": [],
"source": [
"# Job identity and compute resources.\n",
"processing_job_name = \"ctr-prediction-sklearn-preprocessor\"\n",
"processing_instance_type = \"ml.t3.medium\"\n",
"processing_instance_count = 1\n",
"\n",
"# Forwarded to the processing script as --train_valid_split_percentage.\n",
"# NOTE(review): named 'percentage' but holds a fraction (0.8) - confirm the\n",
"# script expects a value in the 0-1 range.\n",
"train_valid_split_percentage = 0.8\n",
"\n",
"# S3 prefixes for the job input and output, both inside the configured bucket.\n",
"input_data_s3_uri = \"s3://{}/input/\".format(s3bucket)\n",
"output_data_s3_uri = \"s3://{}/output/\".format(s3bucket)\n",
"\n",
"# scikit-learn processing container; the job is limited to 7200 seconds of runtime.\n",
"processor = SKLearnProcessor(\n",
"    framework_version=\"0.23-1\",\n",
"    role=role,\n",
"    instance_count=processing_instance_count,\n",
"    instance_type=processing_instance_type,\n",
"    max_runtime_in_seconds=7200,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5744382b",
"metadata": {},
"outputs": [],
"source": [
"# Processing job names must be unique within an AWS account and region, so a\n",
"# timestamp suffix is appended to the base name; reusing the bare name makes\n",
"# every re-run of this cell fail.\n",
"unique_job_name = sagemaker.utils.name_from_base(processing_job_name)\n",
"\n",
"# NOTE(review): both outputs upload to the same S3 prefix, so identically named\n",
"# files under train/ and validation/ would overwrite each other - confirm that\n",
"# sklearn-processor.py writes distinct file names.\n",
"res = processor.run(\n",
"    code=\"sklearn-processor.py\",\n",
"    inputs=[\n",
"        ProcessingInput(\n",
"            source=input_data_s3_uri,\n",
"            destination=\"/opt/ml/processing/input\",\n",
"        ),\n",
"    ],\n",
"    outputs=[\n",
"        ProcessingOutput(\n",
"            source=\"/opt/ml/processing/output/train\",\n",
"            destination=output_data_s3_uri,\n",
"        ),\n",
"        ProcessingOutput(\n",
"            source=\"/opt/ml/processing/output/validation\",\n",
"            destination=output_data_s3_uri,\n",
"        ),\n",
"    ],\n",
"    # The split ratio is passed to the script as a command-line string.\n",
"    arguments=[\n",
"        \"--train_valid_split_percentage\",\n",
"        str(train_valid_split_percentage),\n",
"    ],\n",
"    wait=True,   # block until the job finishes\n",
"    logs=False,  # do not stream container logs into the notebook\n",
"    job_name=unique_job_name,\n",
"    experiment_config=None,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "328fbd3b",
"metadata": {},
"outputs": [],
"source": [
"# Show the first job summary returned by ListProcessingJobs.\n",
"# NOTE(review): presumably the most recently created job under the API's\n",
"# default ordering - confirm.\n",
"jobs = sm.list_processing_jobs()\n",
"job_summaries = pd.DataFrame(jobs['ProcessingJobSummaries'])\n",
"job_summaries.head(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c8148ea",
"metadata": {},
"outputs": [],
"source": [
"# Describe the last job recorded on this processor object and display the result.\n",
"latest_job = processor.jobs[-1]\n",
"processor_description = latest_job.describe()\n",
"processor_description"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment