
Commit 905e9e5

michal2409 authored and asulecki committed
[nnUnet/PyT] Add support for Triton
1 parent 2a2735f commit 905e9e5


59 files changed: +32111 −123 lines changed

PyTorch/Segmentation/nnUNet/Dockerfile

Lines changed: 8 additions & 0 deletions
@@ -6,6 +6,7 @@ WORKDIR /workspace/nnunet_pyt
 
 RUN pip install --upgrade pip
 RUN pip install --disable-pip-version-check -r requirements.txt
+RUN pip install --disable-pip-version-check -r triton/requirements.txt
 RUN pip install pytorch-lightning==1.0.0 --no-dependencies
 RUN pip install monai==0.4.0 --no-dependencies
 RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/ nvidia-dali-cuda110==0.30.0
@@ -14,3 +15,10 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
 RUN unzip -qq awscliv2.zip
 RUN ./aws/install
 RUN rm -rf awscliv2.zip aws
+
+# Install Perf Client required library
+RUN apt-get update && apt-get install -y libb64-dev libb64-0d
+
+# Install Triton Client Python API and copy Perf Client
+#COPY --from=triton-client /workspace/install/ /workspace/install/
+#RUN pip install /workspace/install/python/triton*.whl

PyTorch/Segmentation/nnUNet/README.md

Lines changed: 8 additions & 11 deletions
@@ -134,10 +134,6 @@ TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by defaul
 
 Test time augmentation is an inference technique which averages predictions from augmented images with its prediction. As a result, predictions are more accurate, but with the cost of slower inference process. For nnU-Net, we use all possible flip combinations for image augmenting. Test time augmentation can be enabled by adding the `--tta` flag.
 
-**Deep supervision**
-
-Deep supervision is a technique which adds auxiliary loss in U-Net decoder. For nnU-Net, we add auxiliary losses to all but the lowest two decoder levels. Final loss is the weighted average of losses. Deep supervision can be enabled by adding the `--deep_supervision` flag.
-
 ## Setup
 
 The following section lists the requirements that you need to meet in order to start training the nnU-Net model.
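The flip-based test time augmentation kept in the context above can be sketched in a few lines (an illustration only, assuming a `model` that maps NCDHW tensors to logits; the helper name `tta_average` and the explicit eight axis combinations mirror the description rather than the repository's exact `get_tta_flips` code):

```python
import itertools

import torch


def tta_average(model, img):
    # Average predictions over every combination of spatial flips
    # (2^3 = 8 variants for a 3D NCDHW volume), undoing each flip on the output.
    spatial_dims = (2, 3, 4)  # D, H, W axes
    flips = [dims for r in range(len(spatial_dims) + 1)
             for dims in itertools.combinations(spatial_dims, r)]
    preds = []
    for dims in flips:
        aug = torch.flip(img, dims) if dims else img
        out = model(aug)
        preds.append(torch.flip(out, dims) if dims else out)
    return torch.stack(preds).mean(dim=0)
```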
@@ -308,7 +304,7 @@ To see the full list of available options and their descriptions, use the `-h` o
 The following example output is printed when running the model:
 
 ```
-usage: main.py [-h] [--exec_mode {train,evaluate,predict}] [--data DATA] [--results RESULTS] [--logname LOGNAME] [--task TASK] [--gpus GPUS] [--learning_rate LEARNING_RATE] [--gradient_clip_val GRADIENT_CLIP_VAL] [--negative_slope NEGATIVE_SLOPE] [--tta] [--amp] [--benchmark] [--deep_supervision] [--drop_block] [--attention] [--residual] [--focal] [--sync_batchnorm] [--save_ckpt] [--nfolds NFOLDS] [--seed SEED] [--skip_first_n_eval SKIP_FIRST_N_EVAL] [--ckpt_path CKPT_PATH] [--fold FOLD] [--patience PATIENCE] [--lr_patience LR_PATIENCE] [--batch_size BATCH_SIZE] [--val_batch_size VAL_BATCH_SIZE] [--steps STEPS [STEPS ...]] [--profile] [--momentum MOMENTUM] [--weight_decay WEIGHT_DECAY] [--save_preds] [--dim {2,3}] [--resume_training] [--factor FACTOR] [--num_workers NUM_WORKERS] [--min_epochs MIN_EPOCHS] [--max_epochs MAX_EPOCHS] [--warmup WARMUP] [--norm {instance,batch,group}] [--nvol NVOL] [--data2d_dim {2,3}] [--oversampling OVERSAMPLING] [--overlap OVERLAP] [--affinity {socket,single,single_unique,socket_unique_interleaved,socket_unique_continuous,disabled}] [--scheduler {none,multistep,cosine,plateau}] [--optimizer {sgd,radam,adam}] [--blend {gaussian,constant}] [--train_batches TRAIN_BATCHES] [--test_batches TEST_BATCHES]
+usage: main.py [-h] [--exec_mode {train,evaluate,predict}] [--data DATA] [--results RESULTS] [--logname LOGNAME] [--task TASK] [--gpus GPUS] [--learning_rate LEARNING_RATE] [--gradient_clip_val GRADIENT_CLIP_VAL] [--negative_slope NEGATIVE_SLOPE] [--tta] [--amp] [--benchmark] [--residual] [--focal] [--sync_batchnorm] [--save_ckpt] [--nfolds NFOLDS] [--seed SEED] [--skip_first_n_eval SKIP_FIRST_N_EVAL] [--ckpt_path CKPT_PATH] [--fold FOLD] [--patience PATIENCE] [--lr_patience LR_PATIENCE] [--batch_size BATCH_SIZE] [--val_batch_size VAL_BATCH_SIZE] [--steps STEPS [STEPS ...]] [--profile] [--momentum MOMENTUM] [--weight_decay WEIGHT_DECAY] [--save_preds] [--dim {2,3}] [--resume_training] [--factor FACTOR] [--num_workers NUM_WORKERS] [--min_epochs MIN_EPOCHS] [--max_epochs MAX_EPOCHS] [--warmup WARMUP] [--norm {instance,batch,group}] [--nvol NVOL] [--data2d_dim {2,3}] [--oversampling OVERSAMPLING] [--overlap OVERLAP] [--affinity {socket,single,single_unique,socket_unique_interleaved,socket_unique_continuous,disabled}] [--scheduler {none,multistep,cosine,plateau}] [--optimizer {sgd,radam,adam}] [--blend {gaussian,constant}] [--train_batches TRAIN_BATCHES] [--test_batches TEST_BATCHES]
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -328,9 +324,6 @@ optional arguments:
   --tta                 Enable test time augmentation (default: False)
   --amp                 Enable automatic mixed precision (default: False)
   --benchmark           Run model benchmarking (default: False)
-  --deep_supervision    Enable deep supervision (default: False)
-  --drop_block          Enable drop block (default: False)
-  --attention           Enable attention in decoder (default: False)
   --residual            Enable residual block in encoder (default: False)
   --focal               Use focal loss instead of cross entropy (default: False)
   --sync_batchnorm      Enable synchronized batchnorm (default: False)
@@ -435,7 +428,7 @@ The default configuration minimizes a function `L = (1 - dice_coefficient) + cro
 The training can be run directly without using the predefined scripts. The name of the training script is `main.py`. For example:
 
 ```
-python main.py --exec_mode train --task 01 --fold 0 --gpus 1 --amp --deep_supervision
+python main.py --exec_mode train --task 01 --fold 0 --gpus 1 --amp
 ```
 
 Training artifacts will be saved to `/results` in the container. Some important artifacts are:
@@ -612,7 +605,7 @@ Our results were obtained by running the `python scripts/benchmark.py --mode pre
 
 FP16
 
-| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
 |:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
 | 2 | 64 | 4x192x160 | 1866.52 | 34.29 | 34.7 | 48.87 | 52.44 |
 | 2 | 128 | 4x192x160 | 2032.74 | 62.97 | 63.21 | 63.25 | 63.32 |
@@ -622,7 +615,7 @@ FP16
 
 FP32
 
-| Dimension | Batch size | Resolution | Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
+| Dimension | Batch size |Resolution| Throughput Avg [img/s] | Latency Avg [ms] | Latency 90% [ms] | Latency 95% [ms] | Latency 99% [ms] |
 |:----------:|:---------:|:-------------:|:----------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
 | 2 | 64 | 4x192x160 | 1051.46 | 60.87 | 61.21 | 61.48 | 62.87 |
 | 2 | 128 | 4x192x160 | 1051.68 | 121.71 | 122.29 | 122.44 | 122.6 |
@@ -638,6 +631,10 @@ To achieve these same results, follow the steps in the [Quick Start Guide](#quic
 
 ### Changelog
 
+May 2021
+- Add Triton Inference Server support
+- Removed deep supervision, attention and drop block
+
 March 2021
 - Container updated to 21.02
 - Change data format from tfrecord to npy and data loading for 2D

PyTorch/Segmentation/nnUNet/data_loading/dali_loader.py

Lines changed: 35 additions & 5 deletions
@@ -160,6 +160,37 @@ def define_graph(self):
         return img, lbl
 
 
+class BermudaPipeline(Pipeline):
+    def __init__(self, batch_size, num_threads, device_id, **kwargs):
+        super(BermudaPipeline, self).__init__(batch_size, num_threads, device_id)
+        self.input_x = get_numpy_reader(
+            files=kwargs["imgs"],
+            shard_id=device_id,
+            num_shards=kwargs["gpus"],
+            seed=kwargs["seed"],
+            shuffle=False,
+        )
+        self.input_y = get_numpy_reader(
+            files=kwargs["lbls"],
+            shard_id=device_id,
+            num_shards=kwargs["gpus"],
+            seed=kwargs["seed"],
+            shuffle=False,
+        )
+        self.patch_size = kwargs["patch_size"]
+
+    def crop_fn(self, img, lbl):
+        img = fn.crop(img, crop=self.patch_size, out_of_bounds_policy="pad")
+        lbl = fn.crop(lbl, crop=self.patch_size, out_of_bounds_policy="pad")
+        return img, lbl
+
+    def define_graph(self):
+        img, lbl = self.input_x(name="ReaderX"), self.input_y(name="ReaderY")
+        img, lbl = fn.reshape(img, layout="CDHW"), fn.reshape(lbl, layout="CDHW")
+        img, lbl = self.crop_fn(img, lbl)
+        return img, lbl
+
+
 class TestPipeline(Pipeline):
     def __init__(self, batch_size, num_threads, device_id, **kwargs):
         super(TestPipeline, self).__init__(batch_size, num_threads, device_id)
@@ -249,11 +280,6 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
             nbs *= batch_size
        imgs = list(itertools.chain(*(100 * [imgs])))[: nbs * kwargs["gpus"]]
        lbls = list(itertools.chain(*(100 * [lbls])))[: nbs * kwargs["gpus"]]
-    if mode == "eval":
-        reminder = len(imgs) % kwargs["gpus"]
-        if reminder != 0:
-            imgs = imgs[:-reminder]
-            lbls = lbls[:-reminder]
 
     pipe_kwargs = {
         "imgs": imgs,
@@ -284,6 +310,10 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
         pipeline = EvalPipeline
         output_map = ["image", "label"]
         dynamic_shape = True
+    elif mode == "bermuda":
+        pipeline = BermudaPipeline
+        output_map = ["image", "label"]
+        dynamic_shape = False
     else:
         pipeline = TestPipeline
         output_map = ["image", "meta"]
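Because the new `bermuda` mode sets `dynamic_shape = False`, every batch the pipeline emits has the same padded patch shape, which suits a static-shape serving path. A minimal smoke test of the pipeline on its own might look like this (a sketch, not code from the commit: the `.npy` file names, batch size, and patch size are placeholders):

```python
# Hypothetical check that BermudaPipeline yields fixed-shape CDHW patches.
pipe = BermudaPipeline(
    batch_size=2,
    num_threads=4,
    device_id=0,
    imgs=["case_00_x.npy", "case_01_x.npy"],  # placeholder image volumes
    lbls=["case_00_y.npy", "case_01_y.npy"],  # placeholder label volumes
    gpus=1,
    seed=0,
    patch_size=[128, 128, 128],
)
pipe.build()
img_batch, lbl_batch = pipe.run()  # one batch, cropped/padded to patch_size
```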

PyTorch/Segmentation/nnUNet/models/layers.py

Lines changed: 0 additions & 49 deletions
@@ -15,7 +15,6 @@
 import numpy as np
 import torch
 import torch.nn as nn
-from dropblock import DropBlock3D, LinearScheduler
 
 normalizations = {
     "instancenorm3d": nn.InstanceNorm3d,
@@ -68,30 +67,16 @@ def get_output_padding(kernel_size, stride, padding):
     return out_padding if len(out_padding) > 1 else out_padding[0]
 
 
-def get_drop_block():
-    return LinearScheduler(
-        DropBlock3D(block_size=5, drop_prob=0.0),
-        start_value=0.0,
-        stop_value=0.1,
-        nr_steps=10000,
-    )
-
 
 class ConvLayer(nn.Module):
     def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
         super(ConvLayer, self).__init__()
         self.conv = get_conv(in_channels, out_channels, kernel_size, stride, kwargs["dim"])
         self.norm = get_norm(kwargs["norm"], out_channels)
         self.lrelu = nn.LeakyReLU(negative_slope=kwargs["negative_slope"], inplace=True)
-        self.use_drop_block = kwargs["drop_block"]
-        if self.use_drop_block:
-            self.drop_block = get_drop_block()
 
     def forward(self, data):
         out = self.conv(data)
-        if self.use_drop_block:
-            self.drop_block.step()
-            out = self.drop_block(out)
         out = self.norm(out)
         out = self.lrelu(out)
         return out
@@ -116,10 +101,6 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
         self.conv2 = get_conv(out_channels, out_channels, kernel_size, 1, kwargs["dim"])
         self.norm = get_norm(kwargs["norm"], out_channels)
         self.lrelu = nn.LeakyReLU(negative_slope=kwargs["negative_slope"], inplace=True)
-        self.use_drop_block = kwargs["drop_block"]
-        if self.use_drop_block:
-            self.drop_block = get_drop_block()
-            self.skip_drop_block = get_drop_block()
         self.downsample = None
         if max(stride) > 1 or in_channels != out_channels:
             self.downsample = get_conv(in_channels, out_channels, kernel_size, stride, kwargs["dim"])
@@ -129,52 +110,22 @@ def forward(self, input_data):
         residual = input_data
         out = self.conv1(input_data)
         out = self.conv2(out)
-        if self.use_drop_block:
-            out = self.drop_block(out)
         out = self.norm(out)
         if self.downsample is not None:
             residual = self.downsample(residual)
-        if self.use_drop_block:
-            residual = self.skip_drop_block(residual)
         residual = self.norm_res(residual)
         out = self.lrelu(out + residual)
         return out
 
 
-class AttentionLayer(nn.Module):
-    def __init__(self, in_channels, out_channels, norm, dim):
-        super(AttentionLayer, self).__init__()
-        self.conv = get_conv(in_channels, out_channels, kernel_size=3, stride=1, dim=dim)
-        self.norm = get_norm(norm, out_channels)
-
-    def forward(self, inputs):
-        out = self.conv(inputs)
-        out = self.norm(out)
-        return out
-
-
 class UpsampleBlock(nn.Module):
     def __init__(self, in_channels, out_channels, kernel_size, stride, **kwargs):
         super(UpsampleBlock, self).__init__()
         self.transp_conv = get_transp_conv(in_channels, out_channels, stride, stride, kwargs["dim"])
         self.conv_block = ConvBlock(2 * out_channels, out_channels, kernel_size, 1, **kwargs)
-        self.attention = kwargs["attention"]
-        if self.attention:
-            att_out, norm, dim = out_channels // 2, kwargs["norm"], kwargs["dim"]
-            self.conv_o = AttentionLayer(out_channels, att_out, norm, dim)
-            self.conv_s = AttentionLayer(out_channels, att_out, norm, dim)
-            self.psi = AttentionLayer(att_out, 1, norm, dim)
-            self.sigmoid = nn.Sigmoid()
-            self.relu = nn.ReLU(inplace=True)
 
     def forward(self, input_data, skip_data):
         out = self.transp_conv(input_data)
-        if self.attention:
-            out_a = self.conv_o(out)
-            skip_a = self.conv_s(skip_data)
-            psi_a = self.psi(self.relu(out_a + skip_a))
-            attention = self.sigmoid(psi_a)
-            skip_data = skip_data * attention
         out = torch.cat((out, skip_data), dim=1)
         out = self.conv_block(out)
         return out
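For the record, the deleted attention path implemented an additive attention gate on the skip connection. Restated from the removed forward pass (a reading of the deleted code, with u the upsampled decoder features and s the skip features):

```latex
% conv_o, conv_s, psi are the three removed AttentionLayer instances (conv + norm)
\alpha = \sigma\!\big(\psi(\mathrm{ReLU}(\mathrm{conv_o}(u) + \mathrm{conv_s}(s)))\big),
\qquad \hat{s} = \alpha \odot s
```

The gated skip tensor ŝ then replaced s in the channel-wise concatenation with u.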

PyTorch/Segmentation/nnUNet/models/nn_unet.py

Lines changed: 17 additions & 25 deletions
@@ -39,28 +39,33 @@
 
 
 class NNUnet(pl.LightningModule):
-    def __init__(self, args):
+    def __init__(self, args, bermuda=False, data_dir=None):
         super(NNUnet, self).__init__()
         self.args = args
-        if not hasattr(self.args, "drop_block"):  # For backward compability
-            self.args.drop_block = False
+        self.bermuda = bermuda
+        if data_dir is not None:
+            self.args.data = data_dir
         self.save_hyperparameters()
         self.build_nnunet()
-        self.loss = Loss(self.args.focal)
-        self.dice = Dice(self.n_class)
         self.best_sum = 0
         self.best_sum_epoch = 0
         self.best_dice = self.n_class * [0]
         self.best_epoch = self.n_class * [0]
         self.best_sum_dice = self.n_class * [0]
-        self.learning_rate = args.learning_rate
-        self.tta_flips = get_tta_flips(args.dim)
         self.test_idx = 0
         self.test_imgs = []
-        if self.args.exec_mode in ["train", "evaluate"]:
-            self.dllogger = get_dllogger(args.results)
+        if not self.bermuda:
+            self.learning_rate = args.learning_rate
+            self.loss = Loss(self.args.focal)
+            self.tta_flips = get_tta_flips(args.dim)
+            self.dice = Dice(self.n_class)
+            if self.args.exec_mode in ["train", "evaluate"]:
+                self.dllogger = get_dllogger(args.results)
 
     def forward(self, img):
+        return torch.argmax(self.model(img), 1)
+
+    def _forward(self, img):
         if self.args.benchmark:
             if self.args.dim == 2 and self.args.data2d_dim == 3:
                 img = layout_2d(img, None)
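With `forward()` now reduced to an argmax over the network's logits, the Lightning module can be traced directly into a deployable artifact. A hedged sketch of what that export might look like (not code from this commit; `NNUnet` and `args` come from the diff, while the checkpoint-free construction, input shape, and file name are illustrative assumptions):

```python
import torch

# Hypothetical export sketch: bermuda=True skips loss/metric/logger setup,
# leaving a pure inference graph that ends in torch.argmax.
model = NNUnet(args, bermuda=True)
model.eval()

# Illustrative 3D input: (batch, channels, depth, height, width).
dummy = torch.randn(1, 4, 128, 128, 128)
with torch.no_grad():
    traced = torch.jit.trace(model, dummy)
traced.save("nnunet_ts.pt")  # a file a Triton model repository could serve
```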
@@ -70,14 +75,14 @@ def forward(self, img):
     def training_step(self, batch, batch_idx):
         img, lbl = self.get_train_data(batch)
         pred = self.model(img)
-        loss = self.compute_loss(pred, lbl)
+        loss = self.loss(pred, lbl)
         return loss
 
     def validation_step(self, batch, batch_idx):
         if self.current_epoch < self.args.skip_first_n_eval:
             return None
         img, lbl = batch["image"], batch["label"]
-        pred = self.forward(img)
+        pred = self._forward(img)
         loss = self.loss(pred, lbl)
         self.dice.update(pred, lbl[:, 0])
         return {"val_loss": loss}
@@ -86,7 +91,7 @@ def test_step(self, batch, batch_idx):
         if self.args.exec_mode == "evaluate":
             return self.validation_step(batch, batch_idx)
         img = batch["image"]
-        pred = self.forward(img)
+        pred = self._forward(img)
         if self.args.save_preds:
             meta = batch["meta"][0].cpu().detach().numpy()
             original_shape = meta[2]
@@ -120,25 +125,12 @@ def build_nnunet(self):
             strides=strides,
             dimension=self.args.dim,
             residual=self.args.residual,
-            attention=self.args.attention,
-            drop_block=self.args.drop_block,
             normalization_layer=self.args.norm,
             negative_slope=self.args.negative_slope,
-            deep_supervision=self.args.deep_supervision,
         )
         if is_main_process():
             print(f"Filters: {self.model.filters},\nKernels: {kernels}\nStrides: {strides}")
 
-    def compute_loss(self, preds, label):
-        if self.args.deep_supervision:
-            loss = self.loss(preds[0], label)
-            for i, pred in enumerate(preds[1:]):
-                downsampled_label = nn.functional.interpolate(label, pred.shape[2:])
-                loss += 0.5 ** (i + 1) * self.loss(pred, downsampled_label)
-            c_norm = 1 / (2 - 2 ** (-len(preds)))
-            return c_norm * loss
-        return self.loss(preds, label)
-
     def do_inference(self, image):
         if self.args.dim == 3:
             return self.sliding_window_inference(image)
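The deleted `compute_loss` was the implementation of the deep-supervision loss that the README section above also drops. Restated in formula form (a transcription of the removed code, with k = len(preds), base loss ℓ, p_0 the full-resolution head, and y_i the label interpolated to the shape of p_i):

```latex
L = \frac{1}{2 - 2^{-k}} \left( \ell(p_0,\, y) + \sum_{i=1}^{k-1} 2^{-i}\, \ell(p_i,\, y_i) \right)
```

Each coarser auxiliary head contributed half the weight of the previous one, and the leading constant renormalized the weighted sum.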
