Skip to content

Commit 49a898f

Browse files
authored
[AMDGPU][gfx1250] Support "cluster" syncscope (#157641)
Defaults to "agent" for targets that do not support it. - Add documentation - Register it in MachineModuleInfo - Add MemoryLegalizer support
1 parent 7f4c297 commit 49a898f

10 files changed

+92224
-18
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
537537
- Packed
538538
work-item Add product
539539
IDs names.
540+
- Workgroup
541+
Clusters
540542

541543
=========== =============== ============ ===== ================= =============== =============== ======================
542544

@@ -1095,6 +1097,22 @@ is conservatively correct for OpenCL.
10951097
- ``wavefront`` and executed by a thread in the
10961098
same wavefront.
10971099

1100+
``cluster`` Synchronizes with, and participates in modification
1101+
and seq_cst total orderings with, other operations
1102+
(except image operations) for all address spaces
1103+
(except private, or generic that accesses private)
1104+
provided the other operation's sync scope is:
1105+
1106+
- ``system``, ``agent`` or ``cluster`` and
1107+
executed by a thread on the same cluster.
1108+
- ``workgroup`` and executed by a thread in the
1109+
same work-group.
1110+
- ``wavefront`` and executed by a thread in the
1111+
same wavefront.
1112+
1113+
On targets that do not support workgroup cluster
1114+
launch mode, this behaves like ``agent`` scope instead.
1115+
10981116
``workgroup`` Synchronizes with, and participates in modification
10991117
and seq_cst total orderings with, other operations
11001118
(except image operations) for all address spaces
@@ -1128,6 +1146,9 @@ is conservatively correct for OpenCL.
11281146
``agent-one-as`` Same as ``agent`` but only synchronizes with other
11291147
operations within the same address space.
11301148

1149+
``cluster-one-as`` Same as ``cluster`` but only synchronizes with other
1150+
operations within the same address space.
1151+
11311152
``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with
11321153
other operations within the same address space.
11331154

llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
2323
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
2424
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
2525
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
26+
ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
2627
SystemOneAddressSpaceSSID =
2728
CTX.getOrInsertSyncScopeID("one-as");
2829
AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
3334
CTX.getOrInsertSyncScopeID("wavefront-one-as");
3435
SingleThreadOneAddressSpaceSSID =
3536
CTX.getOrInsertSyncScopeID("singlethread-one-as");
37+
ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
3638
}

llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
3232
SyncScope::ID WorkgroupSSID;
3333
/// Wavefront synchronization scope ID (cross address space).
3434
SyncScope::ID WavefrontSSID;
35+
/// Cluster synchronization scope ID (cross address space).
36+
SyncScope::ID ClusterSSID;
3537
/// System synchronization scope ID (single address space).
3638
SyncScope::ID SystemOneAddressSpaceSSID;
3739
/// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
4244
SyncScope::ID WavefrontOneAddressSpaceSSID;
4345
/// Single thread synchronization scope ID (single address space).
4446
SyncScope::ID SingleThreadOneAddressSpaceSSID;
47+
/// Cluster synchronization scope ID (single address space).
48+
SyncScope::ID ClusterOneAddressSpaceSSID;
4549

4650
/// In AMDGPU target synchronization scopes are inclusive, meaning a
4751
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,20 +64,24 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
6064
else if (SSID == getWorkgroupSSID() ||
6165
SSID == getWorkgroupOneAddressSpaceSSID())
6266
return 2;
67+
else if (SSID == getClusterSSID() ||
68+
SSID == getClusterOneAddressSpaceSSID())
69+
return 3;
6370
else if (SSID == getAgentSSID() ||
6471
SSID == getAgentOneAddressSpaceSSID())
65-
return 3;
72+
return 4;
6673
else if (SSID == SyncScope::System ||
6774
SSID == getSystemOneAddressSpaceSSID())
68-
return 4;
75+
return 5;
6976

7077
return std::nullopt;
7178
}
7279

7380
/// \returns True if \p SSID is restricted to single address space, false
7481
/// otherwise
7582
bool isOneAddressSpace(SyncScope::ID SSID) const {
76-
return SSID == getSingleThreadOneAddressSpaceSSID() ||
83+
return SSID == getClusterOneAddressSpaceSSID() ||
84+
SSID == getSingleThreadOneAddressSpaceSSID() ||
7785
SSID == getWavefrontOneAddressSpaceSSID() ||
7886
SSID == getWorkgroupOneAddressSpaceSSID() ||
7987
SSID == getAgentOneAddressSpaceSSID() ||
@@ -95,6 +103,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
95103
SyncScope::ID getWavefrontSSID() const {
96104
return WavefrontSSID;
97105
}
106+
/// \returns Cluster synchronization scope ID (cross address space).
107+
SyncScope::ID getClusterSSID() const { return ClusterSSID; }
98108
/// \returns System synchronization scope ID (single address space).
99109
SyncScope::ID getSystemOneAddressSpaceSSID() const {
100110
return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
115125
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
116126
return SingleThreadOneAddressSpaceSSID;
117127
}
128+
/// \returns Cluster synchronization scope ID (single address space).
129+
SyncScope::ID getClusterOneAddressSpaceSSID() const {
130+
return ClusterOneAddressSpaceSSID;
131+
}
118132

119133
/// In AMDGPU target synchronization scopes are inclusive, meaning a
120134
/// larger synchronization scope is inclusive of a smaller synchronization

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1833,6 +1833,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18331833
return GFX1250Insts && getGeneration() == GFX12;
18341834
}
18351835

1836+
/// \returns true if the subtarget supports clusters of workgroups.
1837+
bool hasClusters() const { return GFX1250Insts; }
1838+
18361839
/// \returns true if the subtarget requires a wait for xcnt before atomic
18371840
/// flat/global stores & rmw.
18381841
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
6363
SINGLETHREAD,
6464
WAVEFRONT,
6565
WORKGROUP,
66+
CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
6667
AGENT,
6768
SYSTEM
6869
};
@@ -106,6 +107,7 @@ class SIMemOpInfo final {
106107
bool IsCooperative = false;
107108

108109
SIMemOpInfo(
110+
const GCNSubtarget &ST,
109111
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
110112
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
111113
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -156,6 +158,11 @@ class SIMemOpInfo final {
156158
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
157159
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
158160
}
161+
162+
// On targets that have no concept of a workgroup cluster, use
163+
// AGENT scope as a conservatively correct alternative.
164+
if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
165+
this->Scope = SIAtomicScope::AGENT;
159166
}
160167

161168
public:
@@ -225,6 +232,7 @@ class SIMemOpInfo final {
225232
class SIMemOpAccess final {
226233
private:
227234
const AMDGPUMachineModuleInfo *MMI = nullptr;
235+
const GCNSubtarget &ST;
228236

229237
/// Reports unsupported message \p Msg for \p MI to LLVM context.
230238
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -248,7 +256,7 @@ class SIMemOpAccess final {
248256
public:
249257
/// Construct class to support accessing the machine memory operands
250258
/// of instructions, given the module info \p MMI and the subtarget \p ST.
251-
SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
259+
SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
252260

253261
/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
254262
std::optional<SIMemOpInfo>
@@ -773,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
773781
return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
774782
if (SSID == MMI->getAgentSSID())
775783
return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
784+
if (SSID == MMI->getClusterSSID())
785+
return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
776786
if (SSID == MMI->getWorkgroupSSID())
777787
return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
778788
true);
@@ -788,6 +798,9 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
788798
if (SSID == MMI->getAgentOneAddressSpaceSSID())
789799
return std::tuple(SIAtomicScope::AGENT,
790800
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
801+
if (SSID == MMI->getClusterOneAddressSpaceSSID())
802+
return std::tuple(SIAtomicScope::CLUSTER,
803+
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
791804
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
792805
return std::tuple(SIAtomicScope::WORKGROUP,
793806
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -815,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
815828
return SIAtomicAddrSpace::OTHER;
816829
}
817830

818-
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
819-
: MMI(&MMI_) {}
831+
SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
832+
const GCNSubtarget &ST)
833+
: MMI(&MMI_), ST(ST) {}
820834

821835
std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
822836
const MachineBasicBlock::iterator &MI) const {
@@ -877,7 +891,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
877891
return std::nullopt;
878892
}
879893
}
880-
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
894+
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
881895
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
882896
IsNonTemporal, IsLastUse, IsCooperative);
883897
}
@@ -891,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
891905

892906
// Be conservative if there are no memory operands.
893907
if (MI->getNumMemOperands() == 0)
894-
return SIMemOpInfo();
908+
return SIMemOpInfo(ST);
895909

896910
return constructFromMIWithMMO(MI);
897911
}
@@ -905,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
905919

906920
// Be conservative if there are no memory operands.
907921
if (MI->getNumMemOperands() == 0)
908-
return SIMemOpInfo();
922+
return SIMemOpInfo(ST);
909923

910924
return constructFromMIWithMMO(MI);
911925
}
@@ -946,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
946960
if (SynchronizeAS)
947961
OrderingAddrSpace = *SynchronizeAS;
948962

949-
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
950-
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
963+
return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
964+
SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
965+
AtomicOrdering::NotAtomic);
951966
}
952967

953968
std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
@@ -959,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
959974

960975
// Be conservative if there are no memory operands.
961976
if (MI->getNumMemOperands() == 0)
962-
return SIMemOpInfo();
977+
return SIMemOpInfo(ST);
963978

964979
return constructFromMIWithMMO(MI);
965980
}
@@ -2377,6 +2392,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
23772392
switch (Scope) {
23782393
case SIAtomicScope::SYSTEM:
23792394
case SIAtomicScope::AGENT:
2395+
case SIAtomicScope::CLUSTER:
23802396
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
23812397
LOADCnt |= true;
23822398
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2413,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
24132429
switch (Scope) {
24142430
case SIAtomicScope::SYSTEM:
24152431
case SIAtomicScope::AGENT:
2432+
case SIAtomicScope::CLUSTER:
24162433
case SIAtomicScope::WORKGROUP:
24172434
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
24182435
// not needed as LDS operations for all waves are executed in a total
@@ -2495,6 +2512,9 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
24952512
case SIAtomicScope::AGENT:
24962513
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
24972514
break;
2515+
case SIAtomicScope::CLUSTER:
2516+
ScopeImm = AMDGPU::CPol::SCOPE_SE;
2517+
break;
24982518
case SIAtomicScope::WORKGROUP:
24992519
// GFX12.0:
25002520
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -2565,6 +2585,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
25652585
.addImm(AMDGPU::CPol::SCOPE_DEV);
25662586
}
25672587
break;
2588+
case SIAtomicScope::CLUSTER:
25682589
case SIAtomicScope::WORKGROUP:
25692590
// No WB necessary, but we still have to wait.
25702591
break;
@@ -2649,11 +2670,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
26492670
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
26502671

26512672
// GFX12.0 only: Extra waits needed before system scope stores.
2652-
if (!ST.hasGFX1250Insts()) {
2653-
if (!Atomic && Scope == CPol::SCOPE_SYS)
2654-
return insertWaitsBeforeSystemScopeStore(MI);
2655-
return Changed;
2656-
}
2673+
if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2674+
Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
26572675

26582676
return Changed;
26592677
}
@@ -2684,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
26842702
case SIAtomicScope::AGENT:
26852703
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
26862704
break;
2705+
case SIAtomicScope::CLUSTER:
2706+
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2707+
break;
26872708
case SIAtomicScope::WORKGROUP:
26882709
// In workgroup mode, SCOPE_SE is needed as waves can execute on
26892710
// different CUs that access different L0s.
@@ -2930,8 +2951,8 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
29302951
bool SIMemoryLegalizer::run(MachineFunction &MF) {
29312952
bool Changed = false;
29322953

2933-
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
29342954
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2955+
SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
29352956
CC = SICacheControl::create(ST);
29362957

29372958
for (auto &MBB : MF) {

0 commit comments

Comments
 (0)