- Notifications
You must be signed in to change notification settings - Fork 15.1k
[AMDGPU][gfx1250] Remove SCOPE_SE for scratch stores #157640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
This was referenced Sep 9, 2025
Contributor Author
This was referenced Sep 9, 2025
Member
| @llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh). Changes: Patch is 698.23 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/157640.diff. 29 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index f61c0d8f84b29..0be6a9d09379f 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -2655,11 +2655,6 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { return Changed; } - // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address - // space. - if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) - return setScope(MI, CPol::SCOPE_SE); - return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index e886ea4fc6ac6..7e297f46a780e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -73,7 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -192,7 +192,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -311,7 +311,7 @@ define amdgpu_ps void 
@struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -429,7 +429,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -547,7 +547,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -666,7 +666,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double 
@llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -785,7 +785,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -903,7 +903,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -1021,7 +1021,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -1140,7 +1140,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: 
flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0) @@ -1259,7 +1259,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -1377,7 +1377,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr ; GFX1250: ; %bb.0: ; %main_body ; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] ; GFX1250-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 4bb2a13d02cc7..ef52694910da3 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -572,7 +572,7 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB34_5: ; %Flow1 ; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -634,7 +634,7 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB35_5: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 @@ -714,7 +714,7 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -758,7 +758,7 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -826,7 +826,7 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -870,7 +870,7 @@ define double 
@flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1009,7 +1009,7 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB52_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1052,7 +1052,7 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB53_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1095,7 +1095,7 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_i64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB54_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1138,7 +1138,7 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_i64 
v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB55_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1181,7 +1181,7 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB56_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1224,7 +1224,7 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB57_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1267,7 +1267,7 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_u64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB58_4: ; %atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1310,7 +1310,7 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_max_u64 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE +; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-NEXT: .LBB59_4: ; 
%atomicrmw.phi ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 02ead572145f9..752a87ac3cb73 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.cvt = fptrunc float %a to bfloat @@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.abs = call float @llvm.fabs.f32(float %a) @@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.neg = fneg float %a @@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.cvt = fptrunc double %a to bfloat @@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, 
ptr %out) { ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.neg = fneg double %a @@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 ; GFX1250-NEXT: s_endpgm entry: %a.abs = call double @llvm.fabs.f64(double %a) diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1bf37d512f845..f8c2ddf0d7d3c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -577,7 +577,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off -; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -625,7 +625,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off -; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 
@@ -761,7 +761,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off -; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -812,7 +812,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off -; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -948,7 +948,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo -; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE +; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: @@ -987,7 +9... [truncated] |
rampitec approved these changes Sep 9, 2025
Contributor Author
Merge activity
|
f1e9273 to 2a7f810 Compare. Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.

No description provided.