@@ -1134,16 +1134,64 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release)
+            | (AtomicOrdering::Release, AtomicOrdering::Acquire)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _)
+            | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - `failure_order` is stronger than `order`, so we abort.
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+        };
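+        // For illustration: a call like `x.compare_exchange(old, new, Ordering::Relaxed, Ordering::Acquire)`
+        // pairs a Relaxed success ordering with a stronger Acquire failure ordering, so it would hit
+        // the aborting arm above.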
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics.
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // The local space is only accessible to the current thread,
+                // so there are no synchronization issues and we can emulate the cmpxchg
+                // with a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // we select either the current value (if the comparison fails) or the new value,
+                // and then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
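+                // An LLVM `cmpxchg` yields a `{ T, i1 }` aggregate (the loaded value plus a success
+                // flag), so the emulated path below builds a struct of the same shape.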
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        // Unpack the result.
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }
     fn atomic_rmw(
         &mut self,
@@ -1609,3 +1657,98 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (via `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (via `emulate_local`, if `dst` points to the thread-local address space).
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in which address spaces they can operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread-local address space (which should only be accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform an atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is neither in a supported address space nor thread-local, we bail and trap.
+
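+        // Roughly, the emitted control flow looks like this (IR sketch; block names as created below):
+        //
+        //   %global = call i1 @llvm.nvvm.isspacep.global(ptr %dst)
+        //   %shared = call i1 @llvm.nvvm.isspacep.shared(ptr %dst)
+        //   %supported = or i1 %shared, %global
+        //   br i1 %supported, label %atomic_space_supported, label %atomic_space_unsupported
+        //
+        // `atomic_space_unsupported` then tests `llvm.nvvm.isspacep.local` and branches to either
+        // `atomic_local_space` (emulate) or `atomic_space_ub` (trap); both live paths rejoin in `atomic_op_done`.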
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread-local space. If so, we can emulate the atomic.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB, so we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to merge_bb.
+        self.switch_to_block(merge_bb);
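+        // Merge the results from the two live predecessors. Only `supported_bb` and `local_bb`
+        // can reach this block; `atomic_ub_bb` ends in `unreachable`, so it does not feed the phi.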
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}