- Notifications
You must be signed in to change notification settings - Fork 15.1k
Revert "[AMDGPU][gfx1250] Add cu-store subtarget feature (#150588)" #157639
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model. Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
This reverts commit be17791.
| @llvm/pr-subscribers-llvm-binary-utilities @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) ChangesThis reverts commit be17791. Full diff: https://github.com/llvm/llvm-project/pull/157639.diff 13 Files Affected:
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index afd0d9e7539ef..2cddc3365d5d7 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -768,9 +768,6 @@ For example: performant than code generated for XNACK replay disabled. - cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used. - If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater. - =============== ============================ ================================================== .. _amdgpu-target-id: @@ -5114,9 +5111,7 @@ The fields used by CP for code objects before V3 also match those specified in and must be 0, >454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT _SIZE - 455 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled. - If 0, then all stores are ``SCOPE_SE`` or higher. - 457:456 2 bits Reserved, must be 0. + 457:455 3 bits Reserved, must be 0. 458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9 Reserved, must be 0. GFX10-GFX11 @@ -18254,8 +18249,6 @@ terminated by an ``.end_amdhsa_kernel`` directive. GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - ``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in - :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. Specific diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index fb9d68428cf18..418d2b36114c5 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -252,8 +252,7 @@ enum : int32_t { KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), - KERNEL_CODE_PROPERTY(RESERVED0, 7, 2), - KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores + KERNEL_CODE_PROPERTY(RESERVED0, 7, 3), KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+ KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1), KERNEL_CODE_PROPERTY(RESERVED1, 12, 4), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index a366db1c580ba..ffbda14dcd849 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -289,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; -def FeatureCUStores : SubtargetFeature<"cu-stores", - "HasCUStores", - "true", - "Whether SCOPE_CU stores can be used on GFX12.5" ->; - def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -2042,7 +2036,6 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureCUStores, FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature1024AddressableVGPRs, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 1fff188c75819..29f8f9bc8b54c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (ST.isWave32()) { + if (MF.getSubtarget<GCNSubtarget>().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (isGFX1250(ST) && ST.hasCUStores()) { - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; - } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2ce1e9e410b23..e420f2ad676f9 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6181,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; - } else if (ID == ".amdhsa_uses_cu_stores") { - if (!isGFX1250()) - return Error(IDRange.Start, "directive requires gfx12.5", IDRange); - - PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index b50b2a2e6e23c..6f6039bf4ec21 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2639,9 +2639,6 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - if (isGFX1250()) - PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", - KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 556ec683f2ec6..e172a9c699fb1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -252,7 +252,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; - bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -1017,8 +1016,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } - bool hasCUStores() const { return HasCUStores; } - // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 0bbab29dbda18..ff6a21239345d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); - if (isGFX1250(STI)) - PrintField(KD.kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, - ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c20fcacb8fb26..f61c0d8f84b29 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2657,9 +2657,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - // We also require SCOPE_SE minimum if we not have the "cu-stores" feature. - if (Scope == CPol::SCOPE_CU && - (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return Changed; diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll deleted file mode 100644 index fcdba69c30213..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll +++ /dev/null @@ -1,88 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s - -; Check that if -cu-stores is used, we use SCOPE_SE minimum on all stores. - -; GCN: flat_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) { -entry: - store i32 %val, ptr %dst - ret void -} - -; GCN: global_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(1) %dst - ret void -} - -; GCN: local_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(3) %dst - ret void -} - -; GCN: scratch_store: -; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel scratch_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(5) %dst - ret void -} - -; GCN: flat_atomic_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) { -entry: - store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: global_atomic_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: local_atomic_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4 - ret void -} diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index e19be77e901fa..3c693610bee51 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -40,7 +40,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 0c000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00c00000 -// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e068200 00000000 +// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e048200 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -127,7 +127,6 @@ max_vgprs: .amdhsa_user_sgpr_kernarg_preload_length 2 .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 - .amdhsa_uses_cu_stores 1 .amdhsa_wavefront_size32 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -168,7 +167,6 @@ max_vgprs: // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 -// ASM-NEXT: .amdhsa_uses_cu_stores 1 // ASM-NEXT: .amdhsa_wavefront_size32 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test index 369005f4ea432..fdca11b95caa6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test +++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test @@ -13,10 +13,10 @@ # RES_4_2: ; error decoding test.kd: kernel descriptor reserved bits in range (511:480) set # RES_4_2-NEXT: ; decoding failed region as bytes -# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003000000000000 \ -# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_456 -# RES_456: ; error decoding test.kd: kernel descriptor reserved bits in range (456:455) set -# RES_456-NEXT: ; decoding failed region as bytes +# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000006000000000000 \ +# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_457 +# RES_457: ; error decoding test.kd: kernel descriptor reserved bits in range (457:455) set +# RES_457-NEXT: ; decoding failed region as bytes # RUN: yaml2obj %s -DGPU=GFX90A -DKD=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c000000000000 \ # RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=WF32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s index 99a4df3e5adfb..3e96ea3c67380 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s @@ -49,7 +49,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 1 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -57,7 +56,6 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_inst_pref_size 0 - .amdhsa_uses_cu_stores 1 .end_amdhsa_kernel ;--- 2.s @@ -107,7 +105,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -116,6 +113,5 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_named_barrier_count 7 - .amdhsa_uses_cu_stores 0 .amdhsa_inst_pref_size 63 .end_amdhsa_kernel |
| @llvm/pr-subscribers-llvm-support Author: Pierre van Houtryve (Pierre-vh) ChangesThis reverts commit be17791. Full diff: https://github.com/llvm/llvm-project/pull/157639.diff 13 Files Affected:
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index afd0d9e7539ef..2cddc3365d5d7 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -768,9 +768,6 @@ For example: performant than code generated for XNACK replay disabled. - cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used. - If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater. - =============== ============================ ================================================== .. _amdgpu-target-id: @@ -5114,9 +5111,7 @@ The fields used by CP for code objects before V3 also match those specified in and must be 0, >454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT _SIZE - 455 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled. - If 0, then all stores are ``SCOPE_SE`` or higher. - 457:456 2 bits Reserved, must be 0. + 457:455 3 bits Reserved, must be 0. 458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9 Reserved, must be 0. GFX10-GFX11 @@ -18254,8 +18249,6 @@ terminated by an ``.end_amdhsa_kernel`` directive. GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - ``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in - :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. Specific diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index fb9d68428cf18..418d2b36114c5 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -252,8 +252,7 @@ enum : int32_t { KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), - KERNEL_CODE_PROPERTY(RESERVED0, 7, 2), - KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores + KERNEL_CODE_PROPERTY(RESERVED0, 7, 3), KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+ KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1), KERNEL_CODE_PROPERTY(RESERVED1, 12, 4), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index a366db1c580ba..ffbda14dcd849 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -289,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; -def FeatureCUStores : SubtargetFeature<"cu-stores", - "HasCUStores", - "true", - "Whether SCOPE_CU stores can be used on GFX12.5" ->; - def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -2042,7 +2036,6 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureCUStores, FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature1024AddressableVGPRs, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 1fff188c75819..29f8f9bc8b54c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (ST.isWave32()) { + if (MF.getSubtarget<GCNSubtarget>().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (isGFX1250(ST) && ST.hasCUStores()) { - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; - } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2ce1e9e410b23..e420f2ad676f9 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6181,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; - } else if (ID == ".amdhsa_uses_cu_stores") { - if (!isGFX1250()) - return Error(IDRange.Start, "directive requires gfx12.5", IDRange); - - PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index b50b2a2e6e23c..6f6039bf4ec21 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2639,9 +2639,6 @@ Expected<bool> AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - if (isGFX1250()) - PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", - KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 556ec683f2ec6..e172a9c699fb1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -252,7 +252,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; - bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -1017,8 +1016,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } - bool hasCUStores() const { return HasCUStores; } - // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 0bbab29dbda18..ff6a21239345d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); - if (isGFX1250(STI)) - PrintField(KD.kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, - ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c20fcacb8fb26..f61c0d8f84b29 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2657,9 +2657,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - // We also require SCOPE_SE minimum if we not have the "cu-stores" feature. - if (Scope == CPol::SCOPE_CU && - (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return Changed; diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll deleted file mode 100644 index fcdba69c30213..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll +++ /dev/null @@ -1,88 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s - -; Check that if -cu-stores is used, we use SCOPE_SE minimum on all stores. - -; GCN: flat_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) { -entry: - store i32 %val, ptr %dst - ret void -} - -; GCN: global_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(1) %dst - ret void -} - -; GCN: local_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(3) %dst - ret void -} - -; GCN: scratch_store: -; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel scratch_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(5) %dst - ret void -} - -; GCN: flat_atomic_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) { -entry: - store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: global_atomic_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: local_atomic_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4 - ret void -} diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index e19be77e901fa..3c693610bee51 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -40,7 +40,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 0c000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00c00000 -// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e068200 00000000 +// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e048200 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -127,7 +127,6 @@ max_vgprs: .amdhsa_user_sgpr_kernarg_preload_length 2 .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 - .amdhsa_uses_cu_stores 1 .amdhsa_wavefront_size32 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -168,7 +167,6 @@ max_vgprs: // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 -// ASM-NEXT: .amdhsa_uses_cu_stores 1 // ASM-NEXT: .amdhsa_wavefront_size32 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test index 369005f4ea432..fdca11b95caa6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test +++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test @@ -13,10 +13,10 @@ # RES_4_2: ; error decoding test.kd: kernel descriptor reserved bits in range (511:480) set # RES_4_2-NEXT: ; decoding failed region as bytes -# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003000000000000 \ -# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_456 -# RES_456: ; error decoding test.kd: kernel descriptor reserved bits in range (456:455) set -# RES_456-NEXT: ; decoding failed region as bytes +# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000006000000000000 \ +# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_457 +# RES_457: ; error decoding test.kd: kernel descriptor reserved bits in range (457:455) set +# RES_457-NEXT: ; decoding failed region as bytes # RUN: yaml2obj %s -DGPU=GFX90A -DKD=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c000000000000 \ # RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=WF32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s index 99a4df3e5adfb..3e96ea3c67380 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s @@ -49,7 +49,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 1 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -57,7 +56,6 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_inst_pref_size 0 - .amdhsa_uses_cu_stores 1 .end_amdhsa_kernel ;--- 2.s @@ -107,7 +105,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -116,6 +113,5 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_named_barrier_count 7 - .amdhsa_uses_cu_stores 0 .amdhsa_inst_pref_size 63 .end_amdhsa_kernel |
| Why do we want to revert it? Can you put it into the description as well? |
It's not a feature we need anymore for gfx1250. I updated the description |
Merge activity
|

This reverts commit be17791.
This is not necessary for gfx1250 anymore.