Skip to content

Commit b2145bb

Browse files
authored
add notebook for debugger profiling blog (aws#1840)
* add notebook for debugger profiling blog * minor fix * update to be bug-free * clean up / finalize * update to latest version * fix the inst num * minor fix * incorporate feedback * fix pysdk version Co-authored-by: Miyoung Choi <cmiyoung@amazon.com>
1 parent a7669d6 commit b2145bb

File tree

8 files changed

+1165
-0
lines changed

8 files changed

+1165
-0
lines changed
47.2 KB
Loading
284 KB
Loading
301 KB
Loading
400 KB
Loading
1.47 MB
Loading
67.2 KB
Loading

sagemaker-debugger/tensorflow_nlp_sentiment_analysis/sentiment-analysis-tf-distributed-training-bringyourownscript.ipynb

Lines changed: 1039 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import argparse
2+
import numpy as np
3+
import os
4+
import tensorflow.compat.v2 as tf
5+
import horovod.tensorflow.keras as hvd
6+
7+
8+
9+
# Model hyperparameters read by get_model().
max_features = 20000  # input dimension of the Embedding layer
maxlen = 400  # length of each input sequence (Input shape / Embedding input_length)
embedding_dims = 300  # output dimension of the Embedding layer
filters = 250  # number of Conv1D filters
kernel_size = 3  # Conv1D kernel size
hidden_dims = 250  # units in the hidden Dense layer
15+
16+
17+
def parse_args(argv=None):
    """Parse the command-line arguments of the training script.

    Defaults for the data and model directories are read from the
    ``SM_CHANNEL_TRAIN``, ``SM_CHANNEL_TEST`` and ``SM_MODEL_DIR``
    environment variables; a default is ``None`` when its variable is unset.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``.

    Returns:
        The ``(namespace, unknown_args)`` tuple produced by
        ``ArgumentParser.parse_known_args``. Unrecognised arguments are
        collected in the second element instead of raising an error.
    """
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)

    # data directories
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    # model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))

    return parser.parse_known_args(argv)
33+
34+
35+
def get_train_data(train_dir):
    """Load the training arrays stored in ``train_dir``.

    Args:
        train_dir: Directory containing ``x_train.npy`` and ``y_train.npy``.

    Returns:
        Tuple ``(x_train, y_train)`` of the arrays returned by ``np.load``.

    Raises:
        ValueError: If ``train_dir`` is ``None``, which is what the
            ``--train`` argument defaults to when ``SM_CHANNEL_TRAIN``
            is not set.
    """
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise.
    if train_dir is None:
        raise ValueError(
            'train_dir is None: pass --train or set SM_CHANNEL_TRAIN'
        )

    x_train = np.load(os.path.join(train_dir, 'x_train.npy'))
    y_train = np.load(os.path.join(train_dir, 'y_train.npy'))
    print('x train', x_train.shape,'y train', y_train.shape)

    return x_train, y_train
42+
43+
44+
def get_test_data(test_dir):
    """Load the evaluation arrays stored in ``test_dir``.

    Args:
        test_dir: Directory containing ``x_test.npy`` and ``y_test.npy``.

    Returns:
        Tuple ``(x_test, y_test)`` of the arrays returned by ``np.load``.

    Raises:
        ValueError: If ``test_dir`` is ``None``, which is what the
            ``--test`` argument defaults to when ``SM_CHANNEL_TEST``
            is not set.
    """
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise.
    if test_dir is None:
        raise ValueError(
            'test_dir is None: pass --test or set SM_CHANNEL_TEST'
        )

    x_test = np.load(os.path.join(test_dir, 'x_test.npy'))
    y_test = np.load(os.path.join(test_dir, 'y_test.npy'))
    print('x test', x_test.shape,'y test', y_test.shape)

    return x_test, y_test
51+
52+
53+
def get_model():
    """Build the (uncompiled) Keras model used by this script.

    The network is assembled with the functional API: an embedding of the
    integer token sequence, dropout, a 1-D convolution, two max-pooling
    stages, a hidden dense layer with dropout, and a single sigmoid output
    unit. Layer sizes come from the module-level constants
    ``max_features``, ``maxlen``, ``embedding_dims``, ``filters``,
    ``kernel_size`` and ``hidden_dims``.

    Returns:
        A ``tf.keras.Model`` mapping an int32 input of shape ``(maxlen,)``
        to one sigmoid activation per example.
    """
    layers = tf.keras.layers

    embedding = layers.Embedding(max_features, embedding_dims, input_length=maxlen)

    tokens = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embedded = embedding(tokens)

    # Convolutional feature extractor over the embedded sequence.
    dropped = layers.Dropout(0.2)(embedded)
    conv = layers.Conv1D(
        filters,
        kernel_size,
        padding='valid',
        activation='relu',
        strides=1,
    )(dropped)
    pooled = layers.MaxPooling1D()(conv)
    features = layers.GlobalMaxPooling1D()(pooled)

    # Classification head.
    hidden = layers.Dense(hidden_dims, activation='relu')(features)
    hidden = layers.Dropout(0.2)(hidden)
    output = layers.Dense(1, activation='sigmoid')(hidden)

    return tf.keras.Model(tokens, output)
70+
71+
72+
if __name__ == "__main__":
    # Entry point: train the model from get_model() with Horovod data
    # parallelism, then export it as a TensorFlow SavedModel.

    args, _ = parse_args()

    hvd.init()
    lr = 0.001

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.optimizers.Adam(lr * hvd.size())

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    model = get_model()

    # get_model() ends in a single sigmoid unit, so the matching loss is
    # binary cross-entropy. SparseCategoricalCrossentropy expects one output
    # per class and treats the label as a class index, which is out of range
    # for label 1 with a single output column.
    # NOTE(review): assumes y_train / y_test hold 0/1 labels -- confirm
    # against the data-preparation step.
    model.compile(
        loss=tf.losses.BinaryCrossentropy(),
        optimizer=opt,
        metrics=["accuracy"],
        experimental_run_tf_function=False,
    )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all
        # other processes so every worker starts from the same weights.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        # Horovod: average metrics among workers at the end of every epoch.
        hvd.callbacks.MetricAverageCallback(),
        # NOTE(review): newer Horovod releases appear to require an
        # `initial_lr` argument here -- confirm against the pinned version.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1),
    ]

    # Only rank 0 writes checkpoints so workers do not overwrite each other.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint("checkpoint-{epoch}.h5"))

    # Only rank 0 prints training progress.
    verbose = 1 if hvd.rank() == 0 else 0

    model.fit(
        x_train,
        y_train,
        steps_per_epoch=500 // hvd.size(),
        callbacks=callbacks,
        batch_size=args.batch_size,
        epochs=args.epochs,
        validation_data=(x_test, y_test),
        verbose=verbose,
    )

    # create a TensorFlow SavedModel for deployment to a SageMaker endpoint with TensorFlow Serving
    tf.saved_model.save(model, args.model_dir)
125+
126+

0 commit comments

Comments
 (0)