|
49 | 49 | "import sagemaker\n", |
50 | 50 | "from time import sleep\n", |
51 | 51 | "from collections import Counter\n", |
| 52 | + "import numpy as np\n", |
52 | 53 | "import pandas as pd\n", |
53 | 54 | "from sagemaker import get_execution_role, local, Model, utils, fw_utils, s3\n", |
54 | 55 | "from sagemaker.estimator import Estimator\n", |
|
106 | 107 | "source": [ |
107 | 108 | "if not os.path.exists('package'):\n", |
108 | 109 | " !pip install PrettyTable -t package\n", |
109 | | - " !pip install bokeh -t package\n", |
110 | | - " !pip install --pre autogluon -t package\n", |
111 | | - " !pip install numpy==1.16.1 -t package \n", |
112 | 110 | " !pip install --upgrade boto3 -t package\n", |
113 | 111 | " !pip install bokeh -t package\n", |
114 | | - " !pip install --upgrade matplotlib -t package" |
| 112 | + " !pip install --upgrade matplotlib -t package\n", |
| 113 | + " !pip install autogluon -t package" |
115 | 114 | ] |
116 | 115 | }, |
117 | 116 | { |
|
255 | 254 | "Collapsed": "false" |
256 | 255 | }, |
257 | 256 | "source": [ |
258 | | - "## Train" |
259 | | - ] |
260 | | - }, |
261 | | - { |
262 | | - "cell_type": "markdown", |
263 | | - "metadata": { |
264 | | - "Collapsed": "false" |
265 | | - }, |
266 | | - "source": [ |
267 | | - "The minimum requirement for hyperparameters is a target label." |
268 | | - ] |
269 | | - }, |
270 | | - { |
271 | | - "cell_type": "code", |
272 | | - "execution_count": null, |
273 | | - "metadata": { |
274 | | - "Collapsed": "false" |
275 | | - }, |
276 | | - "outputs": [], |
277 | | - "source": [ |
278 | | - "hyperparameters = {'label': 'y'}" |
279 | | - ] |
280 | | - }, |
281 | | - { |
282 | | - "cell_type": "markdown", |
283 | | - "metadata": { |
284 | | - "Collapsed": "false" |
285 | | - }, |
286 | | - "source": [ |
287 | | - "##### (Optional) hyperparameters can be passed to the `autogluon.task.TabularPrediction.fit` function. \n", |
| 257 | + "## Hyperparameter Selection\n", |
288 | 258 | "\n", |
289 | | - "Below shows AutoGluon hyperparameters from the example [Predicting Columns in a Table - In Depth](https://autogluon.mxnet.io/tutorials/tabular_prediction/tabular-indepth.html#model-ensembling-with-stacking-bagging). Please see [fit parameters](https://autogluon.mxnet.io/api/autogluon.task.html?highlight=eval_metric#autogluon.task.TabularPrediction.fit) for further information.\n", |
| 259 | + "The only required setting for training is the target label, `fit_args['label']`.\n", |
290 | 260 | "\n", |
| 261 | + "Additional optional hyperparameters can be passed to the `autogluon.task.TabularPrediction.fit` function via `fit_args`.\n", |
291 | 262 | "\n", |
292 | | - "Here's a more in depth example from the above tutorial that shows how to provide hyperparameter ranges and additional settings:\n", |
| 263 | + "Below is a more in-depth example of AutoGluon-Tabular hyperparameters, taken from [Predicting Columns in a Table - In Depth](https://autogluon.mxnet.io/tutorials/tabular_prediction/tabular-indepth.html#model-ensembling-with-stacking-bagging). Please see [fit parameters](https://autogluon.mxnet.io/api/autogluon.task.html?highlight=eval_metric#autogluon.task.TabularPrediction.fit) for further information. Note that in order for hyperparameter ranges to work in SageMaker, the values passed in `fit_args['hyperparameters']` must be represented as strings.\n", |
293 | 264 | "\n", |
294 | 265 | "```python\n", |
295 | 266 | "nn_options = {\n", |
296 | | - " 'num_epochs': '10',\n", |
| 267 | + " 'num_epochs': \"10\",\n", |
297 | 268 | " 'learning_rate': \"ag.space.Real(1e-4, 1e-2, default=5e-4, log=True)\",\n", |
298 | 269 | " 'activation': \"ag.space.Categorical('relu', 'softrelu', 'tanh')\",\n", |
299 | 270 | " 'layers': \"ag.space.Categorical([100],[1000],[200,100],[300,200,100])\",\n", |
300 | 271 | " 'dropout_prob': \"ag.space.Real(0.0, 0.5, default=0.1)\"\n", |
301 | 272 | "}\n", |
302 | 273 | "\n", |
303 | 274 | "gbm_options = {\n", |
304 | | - " 'num_boost_round': '100',\n", |
| 275 | + " 'num_boost_round': \"100\",\n", |
305 | 276 | " 'num_leaves': \"ag.space.Int(lower=26, upper=66, default=36)\"\n", |
306 | 277 | "}\n", |
307 | 278 | "\n", |
308 | 279 | "model_hps = {'NN': nn_options, 'GBM': gbm_options} \n", |
309 | 280 | "\n", |
| 281 | + "fit_args = {\n", |
| 282 | + " 'label': 'y',\n", |
| 283 | + " 'presets': ['best_quality', 'optimize_for_deployment'],\n", |
| 284 | + " 'time_limits': 60*10,\n", |
| 285 | + " 'hyperparameters': model_hps,\n", |
| 286 | + " 'hyperparameter_tune': True,\n", |
| 287 | + " 'search_strategy': 'skopt'\n", |
| 288 | + "}\n", |
| 289 | + "\n", |
310 | 290 | "hyperparameters = {\n", |
311 | | - " 'label': 'y',\n", |
312 | | - " 'time_limits': 2*60,\n", |
313 | | - " 'hyperparameters': model_hps,\n", |
314 | | - " 'auto_stack': False, \n", |
315 | | - " 'hyperparameter_tune': True,\n", |
316 | | - " 'search_strategy': 'skopt'\n", |
| 291 | + " 'fit_args': fit_args,\n", |
| 292 | + " 'feature_importance': True\n", |
317 | 293 | "}\n", |
318 | 294 | "```\n", |
319 | | - "**Note:** Your hyperparameter choices may affect the size of the model package, which could result in additional time taken to upload your model and complete training.\n", |
| 295 | + "**Note:** Your hyperparameter choices may affect the size of the model package, which could result in additional time taken to upload your model and complete training. Including `'optimize_for_deployment'` in the list of `fit_args['presets']` is recommended to greatly reduce upload times.\n", |
320 | 296 | "\n", |
321 | 297 | "<br>" |
322 | 298 | ] |
323 | 299 | }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": null, |
| 303 | + "metadata": { |
| 304 | + "Collapsed": "false" |
| 305 | + }, |
| 306 | + "outputs": [], |
| 307 | + "source": [ |
| 308 | + "# Define required label and optional additional parameters\n", |
| 309 | + "fit_args = {\n", |
| 310 | + " 'label': 'y',\n", |
| 311 | + " # Adding 'best_quality' to presets list will result in better performance (but longer runtime)\n", |
| 312 | + " 'presets': ['optimize_for_deployment'],\n", |
| 313 | + "}\n", |
| 314 | + "\n", |
| 315 | + "# Pass fit_args to SageMaker estimator hyperparameters\n", |
| 316 | + "hyperparameters = {\n", |
| 317 | + " 'fit_args': fit_args,\n", |
| 318 | + " 'feature_importance': True\n", |
| 319 | + "}" |
| 320 | + ] |
| 321 | + }, |
324 | 322 | { |
325 | 323 | "cell_type": "markdown", |
326 | 324 | "metadata": { |
327 | 325 | "Collapsed": "false" |
328 | 326 | }, |
329 | 327 | "source": [ |
330 | | - "For local training set `train_instance_type` to `local` . \n", |
331 | | - "For non-local training the recommended instance type is `ml.m5.2xlarge` ." |
| 328 | + "## Train\n", |
| 329 | + "\n", |
| 330 | + "For local training, set `train_instance_type` to `local`. \n", |
| 331 | + "For non-local training the recommended instance type is `ml.m5.2xlarge`. \n", |
| 332 | + "\n", |
| 333 | + "**Note:** Depending on how many underlying models are trained, `train_volume_size` may need to be increased so that they all fit on disk." |
332 | 334 | ] |
333 | 335 | }, |
334 | 336 | { |
|
350 | 352 | " role=role,\n", |
351 | 353 | " train_instance_count=1,\n", |
352 | 354 | " train_instance_type=instance_type,\n", |
353 | | - " hyperparameters=hyperparameters)\n", |
| 355 | + " hyperparameters=hyperparameters,\n", |
| 356 | + " train_volume_size=100)\n", |
354 | 357 | "\n", |
355 | | - "estimator.fit(train_s3_path)" |
| 358 | + "# Set inputs. Test data is optional, but requires a label column.\n", |
| 359 | + "inputs = {'training': train_s3_path, 'testing': test_s3_path}\n", |
| 360 | + "\n", |
| 361 | + "estimator.fit(inputs)" |
356 | 362 | ] |
357 | 363 | }, |
358 | 364 | { |
|
516 | 522 | }, |
517 | 523 | "outputs": [], |
518 | 524 | "source": [ |
519 | | - "results = predictor.predict(X_test.to_csv())\n", |
| 525 | + "results = predictor.predict(X_test.to_csv()).splitlines()\n", |
520 | 526 | "\n", |
521 | 527 | "# Check output\n", |
522 | | - "print(Counter(results.splitlines()))" |
| 528 | + "print(Counter(results))" |
523 | 529 | ] |
524 | 530 | }, |
525 | 531 | { |
|
540 | 546 | }, |
541 | 547 | "outputs": [], |
542 | 548 | "source": [ |
543 | | - "results = predictor.predict(test.to_csv())\n", |
| 549 | + "results = predictor.predict(test.to_csv()).splitlines()\n", |
544 | 550 | "\n", |
545 | 551 | "# Check output\n", |
546 | | - "sleep(0.1); print(Counter(results.splitlines()))" |
| 552 | + "print(Counter(results))" |
547 | 553 | ] |
548 | 554 | }, |
549 | 555 | { |
|
552 | 558 | "Collapsed": "false" |
553 | 559 | }, |
554 | 560 | "source": [ |
555 | | - "##### Check that performance metrics match evaluation printed to endpoint logs as expected" |
| 561 | + "##### Check that the classification performance metrics match the evaluation printed to the endpoint logs, as expected" |
556 | 562 | ] |
557 | 563 | }, |
558 | 564 | { |
|
563 | 569 | }, |
564 | 570 | "outputs": [], |
565 | 571 | "source": [ |
566 | | - "import numpy as np\n", |
567 | | - "y_results = np.array(results.splitlines())\n", |
| 572 | + "y_results = np.array(results)\n", |
568 | 573 | "\n", |
569 | 574 | "print(\"accuracy: {}\".format(accuracy_score(y_true=y_test, y_pred=y_results)))\n", |
570 | 575 | "print(classification_report(y_true=y_test, y_pred=y_results, digits=6))" |
|
593 | 598 | ], |
594 | 599 | "metadata": { |
595 | 600 | "kernelspec": { |
596 | | - "display_name": "conda_mxnet_p36", |
| 601 | + "display_name": "Environment (conda_mxnet_p36)", |
597 | 602 | "language": "python", |
598 | 603 | "name": "conda_mxnet_p36" |
599 | 604 | }, |
|
0 commit comments