@@ -178,6 +178,55 @@ exit:
178178 ret void
179179}
180180
181+ define void @divergent_i1_xor_used_outside_loop_twice (float %val , float %pre.cond.val , ptr %addr , ptr %addr2 ) { ; Toggles an i1 on every loop iteration, then uses the toggled value in two selects after the loop.
182+ ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_twice:
183+ ; GFX10: ; %bb.0: ; %entry
184+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185+ ; GFX10-NEXT: s_mov_b32 s4, 0
186+ ; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
187+ ; GFX10-NEXT: v_mov_b32_e32 v1, s4
188+ ; GFX10-NEXT: ; implicit-def: $sgpr6
189+ ; GFX10-NEXT: .LBB3_1: ; %loop
190+ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
191+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
192+ ; GFX10-NEXT: s_xor_b32 s5, s5, -1
193+ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
194+ ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
195+ ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
196+ ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
197+ ; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
198+ ; GFX10-NEXT: s_or_b32 s6, s6, s7
199+ ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
200+ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
201+ ; GFX10-NEXT: ; %bb.2: ; %exit
202+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
203+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
204+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, 2.0, s6
205+ ; GFX10-NEXT: flat_store_dword v[2:3], v0
206+ ; GFX10-NEXT: flat_store_dword v[4:5], v1
207+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
208+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
209+ entry:
210+ %pre.cond = fcmp ogt float %pre.cond.val , 1 .0 ; initial value of the toggled bool: %pre.cond.val > 1.0
211+ br label %loop
212+
213+ loop:
214+ %counter = phi i32 [ 0 , %entry ], [ %counter.plus.1 , %loop ] ; iteration counter, starts at 0
215+ %bool.counter = phi i1 [ %pre.cond , %entry ], [ %neg.bool.counter , %loop ] ; %pre.cond on entry, previous iteration's negation on the back edge
216+ %neg.bool.counter = xor i1 %bool.counter , true ; logical NOT; this is the i1 that is live out of the loop
217+ %f.counter = uitofp i32 %counter to float
218+ %cond = fcmp ogt float %f.counter , %val ; leave the loop once float(counter) > %val
219+ %counter.plus.1 = add i32 %counter , 1
220+ br i1 %cond , label %exit , label %loop
221+
222+ exit:
223+ %select = select i1 %neg.bool.counter , float 1 .000000e+00 , float 0 .000000e+00 ; first use outside the loop: 1.0 if set, else 0.0
224+ store float %select , ptr %addr
225+ %select2 = select i1 %neg.bool.counter , float 2 .000000e+00 , float -1 .000000e+00 ; second use of the same i1: 2.0 if set, else -1.0
226+ store float %select2 , ptr %addr2
227+ ret void
228+ }
229+
181230;void xor(int num_elts, int* a, int* addr) {
182231;for(int i=0; i<num_elts; ++i) {
183232; if(a[i]==0)
@@ -195,15 +244,15 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
195244; GFX10-NEXT: s_mov_b32 s5, 0
196245; GFX10-NEXT: s_mov_b32 s6, -1
197246; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
198- ; GFX10-NEXT: s_cbranch_execz .LBB3_6
247+ ; GFX10-NEXT: s_cbranch_execz .LBB4_6
199248; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
200249; GFX10-NEXT: v_mov_b32_e32 v5, s5
201250; GFX10-NEXT: ; implicit-def: $sgpr6
202251; GFX10-NEXT: ; implicit-def: $sgpr7
203252; GFX10-NEXT: ; implicit-def: $sgpr8
204- ; GFX10-NEXT: s_branch .LBB3_3
205- ; GFX10-NEXT: .LBB3_2 : ; %Flow
206- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
253+ ; GFX10-NEXT: s_branch .LBB4_3
254+ ; GFX10-NEXT: .LBB4_2 : ; %Flow
255+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
207256; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
208257; GFX10-NEXT: s_xor_b32 s9, s8, -1
209258; GFX10-NEXT: s_and_b32 s10, exec_lo, s7
@@ -212,8 +261,8 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
212261; GFX10-NEXT: s_and_b32 s9, exec_lo, s9
213262; GFX10-NEXT: s_or_b32 s6, s6, s9
214263; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
215- ; GFX10-NEXT: s_cbranch_execz .LBB3_5
216- ; GFX10-NEXT: .LBB3_3 : ; %loop.start
264+ ; GFX10-NEXT: s_cbranch_execz .LBB4_5
265+ ; GFX10-NEXT: .LBB4_3 : ; %loop.start
217266; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
218267; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
219268; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -228,9 +277,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
228277; GFX10-NEXT: s_waitcnt vmcnt(0)
229278; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
230279; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo
231- ; GFX10-NEXT: s_cbranch_execz .LBB3_2
280+ ; GFX10-NEXT: s_cbranch_execz .LBB4_2
232281; GFX10-NEXT: ; %bb.4: ; %loop.cond
233- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
282+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
234283; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
235284; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
236285; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -240,20 +289,20 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
240289; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo
241290; GFX10-NEXT: s_or_b32 s8, s8, s10
242291; GFX10-NEXT: s_or_b32 s7, s7, s11
243- ; GFX10-NEXT: s_branch .LBB3_2
244- ; GFX10-NEXT: .LBB3_5 : ; %loop.exit.guard
292+ ; GFX10-NEXT: s_branch .LBB4_2
293+ ; GFX10-NEXT: .LBB4_5 : ; %loop.exit.guard
245294; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
246295; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
247296; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
248297; GFX10-NEXT: s_or_b32 s6, s5, s6
249- ; GFX10-NEXT: .LBB3_6 : ; %Flow1
298+ ; GFX10-NEXT: .LBB4_6 : ; %Flow1
250299; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251300; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252- ; GFX10-NEXT: s_cbranch_execz .LBB3_8
301+ ; GFX10-NEXT: s_cbranch_execz .LBB4_8
253302; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254303; GFX10-NEXT: v_mov_b32_e32 v0, 5
255304; GFX10-NEXT: flat_store_dword v[3:4], v0
256- ; GFX10-NEXT: .LBB3_8 : ; %exit
305+ ; GFX10-NEXT: .LBB4_8 : ; %exit
257306; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258307; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259308; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -299,53 +348,53 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
299348; GFX10-NEXT: s_mov_b32 s5, 0
300349; GFX10-NEXT: ; implicit-def: $sgpr6
301350; GFX10-NEXT: v_mov_b32_e32 v5, s5
302- ; GFX10-NEXT: s_branch .LBB4_2
303- ; GFX10-NEXT: .LBB4_1 : ; %Flow
304- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
351+ ; GFX10-NEXT: s_branch .LBB5_2
352+ ; GFX10-NEXT: .LBB5_1 : ; %Flow
353+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
305354; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
306355; GFX10-NEXT: s_and_b32 s4, exec_lo, s7
307356; GFX10-NEXT: s_or_b32 s5, s4, s5
308357; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
309358; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
310359; GFX10-NEXT: s_or_b32 s6, s4, s6
311360; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
312- ; GFX10-NEXT: s_cbranch_execz .LBB4_6
313- ; GFX10-NEXT: .LBB4_2 : ; %cond.block.0
361+ ; GFX10-NEXT: s_cbranch_execz .LBB5_6
362+ ; GFX10-NEXT: .LBB5_2 : ; %cond.block.0
314363; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315364; GFX10-NEXT: v_mov_b32_e32 v4, v5
316365; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317366; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318- ; GFX10-NEXT: s_cbranch_execz .LBB4_4
367+ ; GFX10-NEXT: s_cbranch_execz .LBB5_4
319368; GFX10-NEXT: ; %bb.3: ; %if.block.0
320- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
369+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
321370; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
322371; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
323372; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
324373; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
325374; GFX10-NEXT: global_store_dword v[8:9], v4, off
326- ; GFX10-NEXT: .LBB4_4 : ; %loop.break.block
327- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
375+ ; GFX10-NEXT: .LBB5_4 : ; %loop.break.block
376+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
328377; GFX10-NEXT: s_waitcnt_depctr 0xffe3
329378; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
330379; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
331380; GFX10-NEXT: s_mov_b32 s7, -1
332381; GFX10-NEXT: ; implicit-def: $vgpr5
333382; GFX10-NEXT: s_and_saveexec_b32 s8, s4
334- ; GFX10-NEXT: s_cbranch_execz .LBB4_1
383+ ; GFX10-NEXT: s_cbranch_execz .LBB5_1
335384; GFX10-NEXT: ; %bb.5: ; %loop.cond
336- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
385+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
337386; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
338387; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
339388; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
340389; GFX10-NEXT: s_or_b32 s7, s4, s7
341- ; GFX10-NEXT: s_branch .LBB4_1
342- ; GFX10-NEXT: .LBB4_6 : ; %cond.block.1
390+ ; GFX10-NEXT: s_branch .LBB5_1
391+ ; GFX10-NEXT: .LBB5_6 : ; %cond.block.1
343392; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
344393; GFX10-NEXT: s_and_saveexec_b32 s4, s6
345- ; GFX10-NEXT: s_cbranch_execz .LBB4_8
394+ ; GFX10-NEXT: s_cbranch_execz .LBB5_8
346395; GFX10-NEXT: ; %bb.7: ; %if.block.1
347396; GFX10-NEXT: global_store_dword v[6:7], v4, off
348- ; GFX10-NEXT: .LBB4_8 : ; %exit
397+ ; GFX10-NEXT: .LBB5_8 : ; %exit
349398; GFX10-NEXT: s_waitcnt_depctr 0xffe3
350399; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
351400; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -410,9 +459,9 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
410459; GFX10-NEXT: v_mov_b32_e32 v5, s0
411460; GFX10-NEXT: ; implicit-def: $sgpr1
412461; GFX10-NEXT: ; implicit-def: $sgpr2
413- ; GFX10-NEXT: s_branch .LBB5_2
414- ; GFX10-NEXT: .LBB5_1 : ; %loop.cond
415- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
462+ ; GFX10-NEXT: s_branch .LBB6_2
463+ ; GFX10-NEXT: .LBB6_1 : ; %loop.cond
464+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
416465; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
417466; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
418467; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5
@@ -423,16 +472,16 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
423472; GFX10-NEXT: s_or_b32 s3, s3, s4
424473; GFX10-NEXT: s_or_b32 s1, s1, s4
425474; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
426- ; GFX10-NEXT: s_cbranch_execz .LBB5_4
427- ; GFX10-NEXT: .LBB5_2 : ; %loop.start
475+ ; GFX10-NEXT: s_cbranch_execz .LBB6_4
476+ ; GFX10-NEXT: .LBB6_2 : ; %loop.start
428477; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
429478; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
430479; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
431480; GFX10-NEXT: s_or_b32 s2, s2, s4
432481; GFX10-NEXT: s_and_saveexec_b32 s4, s3
433- ; GFX10-NEXT: s_cbranch_execz .LBB5_1
482+ ; GFX10-NEXT: s_cbranch_execz .LBB6_1
434483; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
435- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
484+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
436485; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
437486; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
438487; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
@@ -444,8 +493,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
444493; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo
445494; GFX10-NEXT: s_or_b32 s2, s2, s3
446495; GFX10-NEXT: ; implicit-def: $sgpr3
447- ; GFX10-NEXT: s_branch .LBB5_1
448- ; GFX10-NEXT: .LBB5_4 : ; %exit
496+ ; GFX10-NEXT: s_branch .LBB6_1
497+ ; GFX10-NEXT: .LBB6_4 : ; %exit
449498; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
450499; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
451500; GFX10-NEXT: flat_store_dword v[3:4], v0
@@ -486,9 +535,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
486535; GFX10-NEXT: ; implicit-def: $sgpr2
487536; GFX10-NEXT: ; implicit-def: $sgpr3
488537; GFX10-NEXT: v_mov_b32_e32 v6, s0
489- ; GFX10-NEXT: s_branch .LBB6_2
490- ; GFX10-NEXT: .LBB6_1 : ; %Flow
491- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
538+ ; GFX10-NEXT: s_branch .LBB7_2
539+ ; GFX10-NEXT: .LBB7_1 : ; %Flow
540+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
492541; GFX10-NEXT: s_waitcnt_depctr 0xffe3
493542; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
494543; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
@@ -497,8 +546,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
497546; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
498547; GFX10-NEXT: s_or_b32 s1, s1, s4
499548; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
500- ; GFX10-NEXT: s_cbranch_execz .LBB6_4
501- ; GFX10-NEXT: .LBB6_2 : ; %A
549+ ; GFX10-NEXT: s_cbranch_execz .LBB7_4
550+ ; GFX10-NEXT: .LBB7_2 : ; %A
502551; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
503552; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
504553; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
@@ -513,9 +562,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
513562; GFX10-NEXT: s_waitcnt vmcnt(0)
514563; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
515564; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
516- ; GFX10-NEXT: s_cbranch_execz .LBB6_1
565+ ; GFX10-NEXT: s_cbranch_execz .LBB7_1
517566; GFX10-NEXT: ; %bb.3: ; %loop.body
518- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
567+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
519568; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
520569; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
521570; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
@@ -531,16 +580,16 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
531580; GFX10-NEXT: s_waitcnt vmcnt(0)
532581; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
533582; GFX10-NEXT: global_store_dword v[7:8], v9, off
534- ; GFX10-NEXT: s_branch .LBB6_1
535- ; GFX10-NEXT: .LBB6_4 : ; %loop.exit.guard
583+ ; GFX10-NEXT: s_branch .LBB7_1
584+ ; GFX10-NEXT: .LBB7_4 : ; %loop.exit.guard
536585; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
537586; GFX10-NEXT: s_and_saveexec_b32 s0, s1
538587; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
539- ; GFX10-NEXT: s_cbranch_execz .LBB6_6
588+ ; GFX10-NEXT: s_cbranch_execz .LBB7_6
540589; GFX10-NEXT: ; %bb.5: ; %break.body
541590; GFX10-NEXT: v_mov_b32_e32 v0, 10
542591; GFX10-NEXT: global_store_dword v[4:5], v0, off
543- ; GFX10-NEXT: .LBB6_6 : ; %exit
592+ ; GFX10-NEXT: .LBB7_6 : ; %exit
544593; GFX10-NEXT: s_endpgm
545594entry:
546595 br label %A
0 commit comments