@@ -1134,16 +1134,64 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // The LLVM verifier rejects cases where `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release)
+            | (AtomicOrdering::Release, AtomicOrdering::Acquire)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _)
+            | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - `failure_order` is stronger than `order`, so we abort.
+                self.abort();
+                return (
+                    self.const_undef(self.val_ty(cmp)),
+                    self.const_undef(self.type_i1()),
+                );
+            }
+        };
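+        // For illustration: a call like `x.compare_exchange(old, new, Ordering::Relaxed, Ordering::Acquire)`
+        // pairs a Relaxed success ordering with a stronger Acquire failure ordering, so it would hit
+        // the aborting arm above.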
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
+                // We are in a supported address space - just use ordinary atomics.
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // The local space is only accessible to the current thread,
+                // so there are no synchronization issues and we can emulate the cmpxchg
+                // with a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // we select either the current value (if the comparison fails) or the new value,
+                // and then *unconditionally* write that back to local memory (which is very, very cheap).
+                // TODO: measure if this has a positive impact, or if we should just use more blocks and conditional writes.
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
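+                // An LLVM `cmpxchg` yields a `{ T, i1 }` aggregate (the loaded value plus a success
+                // flag), so the emulated path below builds a struct of the same shape.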
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                // We pack the result to match the behaviour of proper atomics / emulated thread-local atomics.
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        // Unpack the result.
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }
     fn atomic_rmw(
         &mut self,
@@ -1609,3 +1657,98 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    /// Implements a standard atomic, using LLVM intrinsics (via `atomic_supported`, if `dst` is in a supported address space)
+    /// or emulation (via `emulate_local`, if `dst` points to the thread-local address space).
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in which address spaces they can operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread-local address space (which should only be accessible to 1 thread, anyway?)
+        // So, we do the following:
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) If so, we perform an atomic operation.
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is neither in a supported address space nor thread-local, we bail and trap.
+
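+        // Roughly, the emitted control flow looks like this (IR sketch; block names as created below):
+        //
+        //   %global = call i1 @llvm.nvvm.isspacep.global(ptr %dst)
+        //   %shared = call i1 @llvm.nvvm.isspacep.shared(ptr %dst)
+        //   %supported = or i1 %shared, %global
+        //   br i1 %supported, label %atomic_space_supported, label %atomic_space_unsupported
+        //
+        // `atomic_space_unsupported` then tests `llvm.nvvm.isspacep.local` and branches to either
+        // `atomic_local_space` (emulate) or `atomic_space_ub` (trap); both live paths rejoin in `atomic_op_done`.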
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute the atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread-local space. If so, we can emulate the atomic.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space nor in the local space.
+        // This is very likely UB, so we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to merge_bb.
+        self.switch_to_block(merge_bb);
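+        // Merge the results from the two live predecessors. Only `supported_bb` and `local_bb`
+        // can reach this block; `atomic_ub_bb` ends in `unreachable`, so it does not feed the phi.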
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}