|
108 | 108 | "metadata": {}, |
109 | 109 | "source": [ |
110 | 110 | "### 1. Accept `--model_dir` command-line argument\n", |
111 | | - "Modify script to accept `--model_dir` as command-line argument which will define the directory path where the output model should be saved. This will be equal to `/opt/ml/model/`\n", |
| 111 | + "Modify the script to accept `--model_dir` as a command-line argument that defines the directory path (i.e. `/opt/ml/model/`) where the output model should be saved. Because SageMaker destroys the entire cluster at the end of training, saving the model to the `/opt/ml/model/` directory prevents the trained model from being lost: at the end of training, SageMaker uploads all the data in `/opt/ml/model/` to S3.\n", |
| 112 | + "\n", |
| 113 | + "This also allows the SageMaker training to integrate with other SageMaker services such as Inference and also allows you to host the trained model outside SageMaker.\n", |
112 | 114 | "\n", |
113 | 115 | "Here is the code that needs to be added to the script:\n", |
114 | 116 | "\n", |
115 | 117 | "```\n", |
116 | 118 | "parser = argparse.ArgumentParser()\n", |
117 | 119 | "parser.add_argument('--model_dir', type=str)\n", |
118 | | - "```" |
| 120 | + "```\n", |
| 121 | + "\n", |
| 122 | + "More details can be found [here](https://github.com/aws/sagemaker-containers/blob/master/README.md)." |
119 | 123 | ] |
120 | 124 | }, |
121 | 125 | { |
|
135 | 139 | "\n", |
136 | 140 | "x_test = np.load(os.path.join(os.environ['SM_CHANNEL_TEST'], 'test.npz'))['data']\n", |
137 | 141 | "y_test = np.load(os.path.join(os.environ['SM_CHANNEL_TEST'], 'test.npz'))['labels']\n", |
138 | | - "```\n" |
| 142 | + "```\n", |
| 143 | + "\n", |
| 144 | + "The list of all environment variables set by SageMaker that are accessible inside the training script can be found [here](https://github.com/aws/sagemaker-containers/blob/master/README.md)." |
139 | 145 | ] |
140 | 146 | }, |
141 | 147 | { |
|
445 | 451 | "source": [ |
446 | 452 | "import boto3\n", |
447 | 453 | "from botocore.exceptions import ClientError\n", |
448 | | - "\n", |
| 454 | + "from time import sleep\n", |
449 | 455 | "\n", |
450 | 456 | "def create_vpn_infra(stack_name=\"hvdvpcstack\"):\n", |
451 | 457 | " cfn = boto3.client(\"cloudformation\")\n", |
|
466 | 472 | "\n", |
467 | 473 | " while describe_stack[\"StackStatus\"] == \"CREATE_IN_PROGRESS\":\n", |
468 | 474 | " describe_stack = cfn.describe_stacks(StackName=stack_name)[\"Stacks\"][0]\n", |
| 475 | + " sleep(0.5)\n", |
469 | 476 | "\n", |
470 | 477 | " if describe_stack[\"StackStatus\"] != \"CREATE_COMPLETE\":\n", |
471 | 478 | " raise ValueError(\"Stack creation failed in state: {}\".format(describe_stack[\"StackStatus\"]))\n", |
|
0 commit comments