Skip to content
Next Next commit
Add ProcessingStep for SageMaker Processing Job
  • Loading branch information
shunjd committed Jul 14, 2020
commit 6b4f5598fc248ba4d0483fa80582ae59c0387a6b
2 changes: 1 addition & 1 deletion src/stepfunctions/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from stepfunctions.steps.states import Pass, Succeed, Fail, Wait, Choice, Parallel, Map, Task, Chain, Retry, Catch
from stepfunctions.steps.states import Graph, FrozenGraph
from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointConfigStep, EndpointStep
from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointConfigStep, EndpointStep, ProcessingStep
from stepfunctions.steps.compute import LambdaStep, BatchSubmitJobStep, GlueStartJobRunStep, EcsRunTaskStep
from stepfunctions.steps.service import DynamoDBGetItemStep, DynamoDBPutItemStep, DynamoDBUpdateItemStep, DynamoDBDeleteItemStep
from stepfunctions.steps.service import SnsPublishStep, SqsSendMessageStep
Expand Down
57 changes: 56 additions & 1 deletion src/stepfunctions/steps/sagemaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from stepfunctions.steps.fields import Field
from stepfunctions.steps.utils import tags_dict_to_kv_list

from sagemaker.workflow.airflow import training_config, transform_config, model_config, tuning_config
from sagemaker.workflow.airflow import training_config, transform_config, model_config, tuning_config, processing_config
from sagemaker.model import Model, FrameworkModel
from sagemaker.model_monitor import DataCaptureConfig

Expand Down Expand Up @@ -356,3 +356,58 @@ def __init__(self, state_id, tuner, job_name, data, wait_for_completion=True, ta
kwargs[Field.Parameters.value] = parameters

super(TuningStep, self).__init__(state_id, **kwargs)


class ProcessingStep(Task):

    """
    Task State that runs a SageMaker Processing Job.
    """

    def __init__(self, state_id, processor, job_name, inputs=None, outputs=None, experiment_config=None, container_arguments=None, container_entrypoint=None, kms_key_id=None, wait_for_completion=True, tags=None, **kwargs):
        """
        Args:
            state_id (str): State name whose length **must be** less than or equal to 128 unicode characters. State names **must be** unique within the scope of the whole state machine.
            processor (sagemaker.processing.Processor): The processor for the processing step.
            job_name (str or Placeholder): Name of the processing job; required for the job to run. A plain string is forwarded to ``processing_config``; an :py:class:`~stepfunctions.inputs.ExecutionInput` or ``StepInput`` placeholder is written to ``ProcessingJobName`` so the value can be supplied dynamically in each execution.
            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
                the processing job (default: None).
            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
                the processing job (default: None).
            experiment_config (dict, optional): Experiment config for the processing job; stored under ``ExperimentConfig`` when not None. (default: None)
            container_arguments ([str]): The arguments for a container used to run a processing job.
            container_entrypoint ([str]): The entrypoint for a container used to run a processing job.
            kms_key_id (str): The AWS Key Management Service (AWS KMS) key that Amazon SageMaker
                uses to encrypt the processing job output. KmsKeyId can be an ID, ARN,
                or alias of a KMS key. The KmsKeyId is applied to all outputs.
            wait_for_completion (bool, optional): `True` if the Task state should wait for the processing job to complete before proceeding to the next step in the workflow; `False` if it should submit the job and proceed immediately. (default: True)
            tags (list[dict], optional): `List of tags <https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html>`_ to associate with the resource.
        """
        # The ".sync" resource is selected when the state should wait for the job.
        kwargs[Field.Resource.value] = (
            'arn:aws:states:::sagemaker:createProcessingJob.sync'
            if wait_for_completion
            else 'arn:aws:states:::sagemaker:createProcessingJob'
        )

        config_kwargs = {
            'processor': processor,
            'inputs': inputs,
            'outputs': outputs,
            'container_arguments': container_arguments,
            'container_entrypoint': container_entrypoint,
            'kms_key_id': kms_key_id,
        }
        # Only a literal string name is handed to processing_config; any other
        # job_name value is omitted from the call.
        if isinstance(job_name, str):
            config_kwargs['job_name'] = job_name
        parameters = processing_config(**config_kwargs)

        # Placeholders are written verbatim into the state parameters.
        if isinstance(job_name, (ExecutionInput, StepInput)):
            parameters['ProcessingJobName'] = job_name

        if experiment_config is not None:
            parameters['ExperimentConfig'] = experiment_config

        if tags:
            parameters['Tags'] = tags_dict_to_kv_list(tags)

        # 'S3Operations' is never included in the state parameters.
        parameters.pop('S3Operations', None)

        kwargs[Field.Parameters.value] = parameters

        super(ProcessingStep, self).__init__(state_id, **kwargs)
87 changes: 86 additions & 1 deletion tests/unit/test_sagemaker_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
from sagemaker.pipeline import PipelineModel
from sagemaker.model_monitor import DataCaptureConfig
from sagemaker.debugger import Rule, rule_configs, DebuggerHookConfig, CollectionConfig
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

from unittest.mock import MagicMock, patch
from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointStep, EndpointConfigStep
from stepfunctions.steps.sagemaker import TrainingStep, TransformStep, ModelStep, EndpointStep, EndpointConfigStep, ProcessingStep
from stepfunctions.steps.sagemaker import tuning_config

from tests.unit.utils import mock_boto_api_call
Expand Down Expand Up @@ -156,6 +158,17 @@ def tensorflow_estimator():

return estimator

@pytest.fixture
def sklearn_processor():
    """Provide an SKLearnProcessor (framework 0.20.0, one ml.m5.xlarge instance) for the tests."""
    return SKLearnProcessor(
        framework_version="0.20.0",
        role=EXECUTION_ROLE,
        instance_type="ml.m5.xlarge",
        instance_count=1,
    )

@patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
def test_training_step_creation(pca_estimator):
step = TrainingStep('Training',
Expand Down Expand Up @@ -566,3 +579,75 @@ def test_endpoint_step_creation(pca_model):
'Resource': 'arn:aws:states:::sagemaker:updateEndpoint',
'End': True
}

# Patched like the other step-creation tests in this module so that any boto
# API call made while the step is built goes through the mock.
@patch('botocore.client.BaseClient._make_api_call', new=mock_boto_api_call)
def test_processing_step_creation(sklearn_processor):
    """ProcessingStep renders the expected Task state for a literal job name.

    Builds a step with one input and three outputs and compares the full
    ``to_dict()`` result, including the default ``.sync`` resource ARN.
    """
    inputs = [ProcessingInput(source='dataset.csv', destination='/opt/ml/processing/input')]
    outputs = [
        ProcessingOutput(source='/opt/ml/processing/output/train'),
        ProcessingOutput(source='/opt/ml/processing/output/validation'),
        ProcessingOutput(source='/opt/ml/processing/output/test')
    ]
    step = ProcessingStep('Feature Transformation', sklearn_processor, 'MyProcessingJob', inputs=inputs, outputs=outputs)
    # NOTE(review): the expected ImageUri hard-codes us-east-1; presumably this
    # relies on the region configured in the test environment -- confirm.
    assert step.to_dict() == {
        'Type': 'Task',
        'Parameters': {
            'AppSpecification': {
                'ImageUri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3'
            },
            'ProcessingInputs': [
                {
                    'InputName': None,
                    'S3Input': {
                        'LocalPath': '/opt/ml/processing/input',
                        'S3CompressionType': 'None',
                        'S3DataDistributionType': 'FullyReplicated',
                        'S3DataType': 'S3Prefix',
                        'S3InputMode': 'File',
                        'S3Uri': 'dataset.csv'
                    }
                }
            ],
            'ProcessingOutputConfig': {
                'Outputs': [
                    {
                        'OutputName': None,
                        'S3Output': {
                            'LocalPath': '/opt/ml/processing/output/train',
                            'S3UploadMode': 'EndOfJob',
                            'S3Uri': None
                        }
                    },
                    {
                        'OutputName': None,
                        'S3Output': {
                            'LocalPath': '/opt/ml/processing/output/validation',
                            'S3UploadMode': 'EndOfJob',
                            'S3Uri': None
                        }
                    },
                    {
                        'OutputName': None,
                        'S3Output': {
                            'LocalPath': '/opt/ml/processing/output/test',
                            'S3UploadMode': 'EndOfJob',
                            'S3Uri': None
                        }
                    }
                ]
            },
            'ProcessingResources': {
                'ClusterConfig': {
                    'InstanceCount': 1,
                    'InstanceType': 'ml.m5.xlarge',
                    'VolumeSizeInGB': 30
                }
            },
            'ProcessingJobName': 'MyProcessingJob',
            'RoleArn': EXECUTION_ROLE,
            'StoppingCondition': {
                'MaxRuntimeInSeconds': None
            }
        },
        'Resource': 'arn:aws:states:::sagemaker:createProcessingJob.sync',
        'End': True
    }