Skip to content

Conversation

@justinfargnoli
Copy link
Contributor

@justinfargnoli justinfargnoli commented Jul 12, 2024

Propagate ISD::TRUNCATEs to the operands of logical operations if a target is particularly sensitive to register pressure.

@llvmbot
Copy link
Member

llvmbot commented Jul 12, 2024

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-selectiondag

@llvm/pr-subscribers-backend-nvptx

Author: Justin Fargnoli (justinfargnoli)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/98666.diff

2 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+54-1)
  • (added) llvm/test/CodeGen/NVPTX/combine-truncate.ll (+90)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 476a532db0a37..26729c7adb020 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -725,7 +725,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // We have some custom DAG combine patterns for these nodes setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, - ISD::VSELECT}); + ISD::TRUNCATE, ISD::VSELECT}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -5541,6 +5541,53 @@ static SDValue PerformREMCombine(SDNode *N, return SDValue(); } +// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y) +// This will reduce register pressure. +static SDValue PerformTruncCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LogicalOp = N->getOperand(0); + switch (LogicalOp.getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + EVT VT = N->getValueType(0); + EVT LogicalVT = LogicalOp.getValueType(); + if (VT != MVT::i32 || LogicalVT != MVT::i64) + break; + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (!VT.isScalarInteger() && + !TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) + break; + if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) { + return U->isMachineOpcode() + ? 
U->getMachineOpcode() == NVPTX::CVT_u32_u64 + : U->getOpcode() == ISD::TRUNCATE; + })) + break; + + SDLoc DL(N); + SDValue CVTNone = + DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT, + LogicalOp.getOperand(0), CVTNone); + SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT, + LogicalOp.getOperand(1), CVTNone); + return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0), + SDValue(NarrowR, 0)); + } + } + + return SDValue(); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -5957,6 +6004,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::UREM: case ISD::SREM: return PerformREMCombine(N, DCI, OptLevel); + case ISD::TRUNCATE: + return PerformTruncCombine(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::LOAD: @@ -5974,6 +6023,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: return PerformVSELECTCombine(N, DCI); } + + if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64) + return PerformTruncCombine(N, DCI); + return SDValue(); } diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll new file mode 100644 index 0000000000000..30e415ebe9527 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define i32 @trunc(i64 %a, i64 %b) { +; CHECK-LABEL: trunc( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_param_1]; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd2; +; 
CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: or.b32 %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %or = or i64 %a, %b + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} + +define i32 @trunc_not(i64 %a, i64 %b) { +; CHECK-LABEL: trunc_not( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_not_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_not_param_1]; +; CHECK-NEXT: or.b64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd3; +; CHECK-NEXT: mov.u64 %rd4, 0; +; CHECK-NEXT: st.u64 [%rd4], %rd3; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: ret; + %or = or i64 %a, %b + %trunc = trunc i64 %or to i32 + store i64 %or, ptr null + ret i32 %trunc +} + +define i32 @trunc_cvt(i64 %a, i64 %b) { +; CHECK-LABEL: trunc_cvt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_param_1]; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: add.s32 %r3, %r2, %r1; +; CHECK-NEXT: or.b32 %r4, %r3, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; +; CHECK-NEXT: ret; + %add = add i64 %a, %b + %or = or i64 %add, %a + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} + +define i32 @trunc_cvt_not(i64 %a, i64 %b) { +; CHECK-LABEL: trunc_cvt_not( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_not_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_not_param_1]; +; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2; +; CHECK-NEXT: mov.u64 %rd4, 0; +; CHECK-NEXT: st.u64 [%rd4], %rd3; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd3; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; +; 
CHECK-NEXT: st.param.b32 [func_retval0+0], %r3; +; CHECK-NEXT: ret; + %add = add i64 %a, %b + store i64 %add, ptr null + %or = or i64 %add, %a + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} 
@justinfargnoli justinfargnoli changed the title [NVPTX] Propagate ISD::TRUNCATE to operands to reduce register pressure [NVPTX] Propagate truncate to operands Jul 12, 2024
@Artem-B
Copy link
Member

Artem-B commented Jul 15, 2024

The DAGCombiner has the same combine: https://github.com/llvm/llvm-project/blob/ae63db78828621feca713016e33f34717da626a5/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L15170

Interesting, then the question is -- why it apparently doesn't fire in NVPTX. That may be what needs fixing.

@tschuett
Copy link

The DAGCombiner has the same combine: https://github.com/llvm/llvm-project/blob/ae63db78828621feca713016e33f34717da626a5/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L15170

Interesting, then the question is -- why it apparently doesn't fire in NVPTX. That may be what needs fixing.

It expects one of the parameters of the binop to be constant. This combine seems to be less restrictive.

@Artem-B
Copy link
Member

Artem-B commented Jul 15, 2024

It expects one of the parameters of the binop to be constant.

I see. I guess the assumption is that if one of the arguments is a constant, then truncation for it is free, so it never increases the number of truncates and assumes that the logical op is the same cost, regardless of the size.

I think adding a new target-specific check here would be appropriate. Considering that register pressure is a pretty common issue for NVPTX, something fairly generic like TLI.shouldReduceRegisterPressure() may be useful here and likely in other places where we may need to adjust optimizer assumptions for NVPTX.

@tschuett
Copy link

It expects one of the parameters of the binop to be constant.

I see. I guess the assumption is that if one of the arguments is a constant, then truncation for it is free, so it never increases the number of truncates and assumes that the logical op is the same cost, regardless of the size.

I think adding a new target-specific check here would be appropriate. Considering that register pressure is a pretty common issue for NVPTX, something fairly generic like TLI.shouldReduceRegisterPressure() may be useful here and likely in other places where we may need to adjust optimizer assumptions for NVPTX.

Your 128-bit mul could become a 32-bit mul after the combine.

@Artem-B
Copy link
Member

Artem-B commented Jul 15, 2024

Your 128-bit mul could become a 32-bit mul after the combine.

Mul is the odd one out on the list as it nominally has a different number of valid bits in the operands and the result.
If the target has widening mul (NVPTX does), then the operands could be indeed truncated to half of the result size.

I think the optimization for mul could be improved by checking if UMUL_LOHI is legal and using that with operands truncated to half the size. That's somewhat orthogonal to a simpler choice whether it's beneficial to use an additional truncate but save on the number of registers used. Let's get this part sorted out first.

@github-actions
Copy link

github-actions bot commented Jul 15, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@justinfargnoli justinfargnoli force-pushed the dev/jf/trunc-upstream branch from f987431 to 7fb23e5 Compare July 17, 2024 01:49
@justinfargnoli justinfargnoli added the llvm:SelectionDAG SelectionDAGISel as well label Jul 17, 2024
Copy link
Member

@Artem-B Artem-B left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. LGTM with a few minor nits.

@justinfargnoli justinfargnoli changed the title [NVPTX] Propagate truncate to operands [DAGCombine] Propagate truncate to operands Jul 22, 2024
@justinfargnoli justinfargnoli requested a review from arsenm July 23, 2024 00:48
@RKSimon
Copy link
Collaborator

RKSimon commented Sep 3, 2024

@justinfargnoli reverse-ping?

@justinfargnoli justinfargnoli added help wanted Indicates that a maintainer wants help. Not [good first issue]. backend:AMDGPU backend:X86 labels May 6, 2025
@justinfargnoli justinfargnoli force-pushed the dev/jf/trunc-upstream branch from 5e4123e to 99da17d Compare May 6, 2025 04:21
Comment on lines +15895 to +15898
SDValue TruncatedOp =
DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
break;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needing to speculatively create some nodes in case it's profitable is unfortunate, is there a way to avoid this

case ISD::MUL:
case ISD::SETCC:
case ISD::SELECT:
if (DestVT.getScalarSizeInBits() == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getScalarType() == i1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:AMDGPU backend:NVPTX backend:X86 help wanted Indicates that a maintainer wants help. Not [good first issue]. llvm:SelectionDAG SelectionDAGISel as well

7 participants