mit-han-lab
diff --git a/‎README.md‎
Lines changed: 3 additions & 0 deletions b/‎README.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/example.py‎
Lines changed: 19 additions & 7 deletions b/‎examples/example.py‎
Lines changed: 19 additions & 7 deletions
diff --git a/‎setup.py‎
Lines changed: 1 addition & 3 deletions b/‎setup.py‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎torchsparse/nn/functional/conv.py‎
Lines changed: 7 additions & 2 deletions b/‎torchsparse/nn/functional/conv.py‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎torchsparse/nn/functional/devox.py‎
Lines changed: 3 additions & 0 deletions b/‎torchsparse/nn/functional/devox.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎torchsparse/nn/functional/downsample.py‎
Lines changed: 2 additions & 2 deletions b/‎torchsparse/nn/functional/downsample.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎torchsparse/nn/functional/voxelize.py‎
Lines changed: 6 additions & 2 deletions b/‎torchsparse/nn/functional/voxelize.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎torchsparse/src/common/gpu.cuh‎
Lines changed: 6 additions & 0 deletions b/‎torchsparse/src/common/gpu.cuh‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎torchsparse/src/convolution/convolution.cpp‎
Lines changed: 0 additions & 154 deletions b/‎torchsparse/src/convolution/convolution.cpp‎
Lines changed: 0 additions & 154 deletions
@@ -107,6 +107,9 @@ We here provides an entire training example with dummy input [here](examples/exa
 
 You are also welcome to check out our [SPVNAS](https://github.com/mit-han-lab/e3d) project to implement training / inference with real data.
 
+### Mixed Precision (float16) Support
+
+Mixed precision training is supported via `torch.cuda.amp.autocast` and `torch.cuda.amp.GradScaler`. Enabling mixed precision training can speed up training and reduce GPU memory usage. By wrapping your training code in a `torch.cuda.amp.autocast` block, feature tensors will automatically be converted to float16 if possible. See [here](examples/example.py) for a complete example. 
 
 ## Speed Comparison Between torchsparse and MinkowskiEngine
 
 
@@ -5,6 +5,7 @@
 import torchsparse.nn as spnn
 from torchsparse import SparseTensor
 from torchsparse.utils import sparse_collate_fn, sparse_quantize
+import argparse
 
 
 def generate_random_point_cloud(size=100000, voxel_size=0.2):
@@ -39,7 +40,7 @@ def generate_batched_random_point_clouds(size=100000,
  return sparse_collate_fn(batch)
 
 
-def dummy_train(device):
+def dummy_train(device, mixed=False):
  model = nn.Sequential(
  spnn.Conv3d(4, 32, kernel_size=3, stride=1), spnn.BatchNorm(32),
  spnn.ReLU(True), spnn.Conv3d(32, 64, kernel_size=2, stride=2),
@@ -50,21 +51,32 @@ def dummy_train(device):
  spnn.ReLU(True), spnn.Conv3d(32, 10, kernel_size=1)).to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
  criterion = nn.CrossEntropyLoss().to(device)
+ scaler = torch.cuda.amp.GradScaler(enabled=mixed)
 
  print('Starting dummy training...')
  for i in range(10):
+ optimizer.zero_grad()
  feed_dict = generate_batched_random_point_clouds()
  inputs = feed_dict['lidar'].to(device)
  targets = feed_dict['targets'].F.to(device).long()
- outputs = model(inputs)
- optimizer.zero_grad()
- loss = criterion(outputs.F, targets)
- loss.backward()
- optimizer.step()
+ with torch.cuda.amp.autocast(enabled=mixed):
+ outputs = model(inputs)
+ loss = criterion(outputs.F, targets)
+ scaler.scale(loss).backward()
+ scaler.step(optimizer)
+ scaler.update()
  print('[step %d] loss = %f.' % (i, loss.item()))
  print('Finished dummy training!')
 
 
 if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--mixed", action="store_true")
+ args = parser.parse_args()
+
+ # set seeds for reproducibility
+ np.random.seed(2021)
+ torch.manual_seed(2021)
+
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
- dummy_train(device)
+ dummy_train(device, args.mixed)
@@ -17,14 +17,13 @@
 file_lis = [
  'torchsparse/src/torchsparse_bindings_gpu.cpp',
  'torchsparse/src/convolution/convolution_cpu.cpp',
- 'torchsparse/src/convolution/convolution.cpp',
+ 'torchsparse/src/convolution/convolution.cu',
  'torchsparse/src/convolution/convolution_gpu.cu',
  'torchsparse/src/hash/hash_cpu.cpp',
  'torchsparse/src/hash/hash.cpp',
  'torchsparse/src/hash/hash_gpu.cu',
  'torchsparse/src/hashmap/hashmap.cu',
  'torchsparse/src/hashmap/hashmap_cpu.cpp',
- 'torchsparse/src/interpolation/devox.cpp',
  'torchsparse/src/interpolation/devox_gpu.cu',
  'torchsparse/src/interpolation/devox_deterministic.cpp',
  'torchsparse/src/interpolation/devox_deterministic_gpu.cu',
@@ -35,7 +34,6 @@
  'torchsparse/src/others/count.cpp',
  'torchsparse/src/others/count_gpu.cu',
  'torchsparse/src/others/count_cpu.cpp',
- 'torchsparse/src/others/insertion.cpp',
  'torchsparse/src/others/insertion_gpu.cu',
  'torchsparse/src/others/insertion_cpu.cpp',
  'torchsparse/src/others/query.cpp',
 
@@ -3,6 +3,7 @@
 import torch
 import torchsparse_backend
 from torch.autograd import Function
+from torch.cuda.amp import custom_fwd, custom_bwd
 from torchsparse import *
 from torchsparse.nn.functional.convert_neighbor_map import *
 from torchsparse.nn.functional.downsample import *
@@ -15,6 +16,7 @@
 
 class SpConvolution(Function):
  @staticmethod
+ @custom_fwd(cast_inputs=torch.half)
  def forward(ctx,
  features,
  kernel,
@@ -27,11 +29,13 @@ def forward(ctx,
  if not transpose:
  out = torch.zeros(sizes[1],
  kernel.size(-1),
+ dtype=features.dtype,
  device=features.device)
  else:
  # TODO: ensure the upsampled size is the same as the original size.
  out = torch.zeros(sizes[0],
  kernel.size(-1),
+ dtype=features.dtype,
  device=features.device)
 
  if 'cuda' in str(features.device):
@@ -61,12 +65,13 @@ def forward(ctx,
  return out
 
  @staticmethod
+ @custom_bwd
  def backward(ctx, grad_out):
  features, kernel, neighbor_map, neighbor_offset, transpose = ctx.for_backwards
  K, c_in, c_out = kernel.size()
  N_in = features.size(0)
- grad_features = torch.zeros(N_in, c_in, device=features.device)
- grad_kernel = torch.zeros(K, c_in, c_out, device=kernel.device)
+ grad_features = torch.zeros(N_in, c_in, device=features.device, dtype=features.dtype)
+ grad_kernel = torch.zeros(K, c_in, c_out, device=kernel.device, dtype=features.dtype)
 
  if 'cuda' in str(features.device):
  torchsparse_backend.sparseconv_backward(features, grad_features,
 
@@ -1,6 +1,7 @@
 import torch
 import torchsparse_backend
 from torch.autograd import Function
+from torch.cuda.amp import custom_fwd, custom_bwd
 
 __all__ = ['spdevoxelize', 'calc_ti_weights']
 
@@ -61,6 +62,7 @@ def calc_ti_weights(pc, idx_query, scale=1.0):
 
 class DevoxelizationGPU(Function):
  @staticmethod
+ @custom_fwd(cast_inputs=torch.half)
  def forward(ctx, feat, indices, weights):
  if 'cuda' in str(feat.device):
  out = torchsparse_backend.devoxelize_forward(
@@ -77,6 +79,7 @@ def forward(ctx, feat, indices, weights):
  return out
 
  @staticmethod
+ @custom_bwd
  def backward(ctx, grad_out):
  indices, weights, n = ctx.for_backwards
 
 
@@ -2,6 +2,7 @@
 import torchsparse_backend
 from torch.autograd import Function
 from torchsparse.nn.functional.hash import *
+from torchsparse.nn.functional.voxelize import spvoxelize
 
 __all__ = ['spdownsample']
 
@@ -23,8 +24,7 @@ def forward(ctx, coords, ratio):
  # rounding is necessary
  # gpu
  if 'cuda' in str(coords.device):
- uq_coords = torch.round(
- torchsparse_backend.insertion_forward(coords_new.float(), inv,
+ uq_coords = torch.round(spvoxelize(coords_new.float(), inv,
  cnt))
  elif 'cpu' in str(coords.device):
  uq_coords = torch.round(
 
@@ -1,24 +1,28 @@
+import torch
 import torchsparse_backend
 from torch.autograd import Function
+from torch.cuda.amp import custom_fwd, custom_bwd
 from torchsparse.nn.functional.hash import *
 
 __all__ = ['spvoxelize']
 
 
 class VoxelizeGPU(Function):
  @staticmethod
+ @custom_fwd(cast_inputs=torch.half)
  def forward(ctx, feat, idx, cnt):
- out = torchsparse_backend.insertion_forward(feat.float().contiguous(),
+ out = torchsparse_backend.insertion_forward(feat.contiguous(),
  idx.int().contiguous(),
  cnt)
  ctx.for_backwards = (idx.int().contiguous(), cnt, feat.shape[0])
  return out
 
  @staticmethod
+ @custom_bwd
  def backward(ctx, top_grad):
  idx, cnt, N = ctx.for_backwards
  bottom_grad = torchsparse_backend.insertion_backward(
- top_grad.float().contiguous(), idx, cnt, N)
+ top_grad.contiguous(), idx, cnt, N)
  return bottom_grad, None, None
 
 
 
@@ -13,6 +13,7 @@
 #include <exception>
 #include <iostream>
 #include <vector>
+#include <torch/torch.h>
 
 
 //
@@ -103,6 +104,11 @@ template <typename Dtype1, typename Dtype2>
 void print(const thrust::device_vector<Dtype1> &v1,
  const thrust::device_vector<Dtype2> &v2);
 
+// atomicadd for half types (from aten/src/THC/THCAtomics.cuh)
+static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+ return atomicAdd(reinterpret_cast<__half*>(address), val);
+}
+
 // AtomicAddition for double with cuda arch <= 600
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
 #else