Skip to content

Conversation

@MatzeB
Copy link
Contributor

@MatzeB MatzeB commented Apr 11, 2024

This changes GlobalOpt to skip/look-through threadlocal.address intrinsic where apropriate.

This fixes issue #73314

@MatzeB MatzeB force-pushed the threadlocal_globalopt branch from 3cb0601 to ce16ee0 Compare April 16, 2024 00:56
@MatzeB MatzeB marked this pull request as ready for review April 16, 2024 01:05
@llvmbot
Copy link
Member

llvmbot commented Apr 16, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Matthias Braun (MatzeB)

Changes

This changes GlobalOpt to skip/look-through threadlocal.address intrinsic where apropriate.

This fixes issue #73314 and improves issue #87437


Full diff: https://github.com/llvm/llvm-project/pull/88454.diff

7 Files Affected:

  • (modified) llvm/include/llvm/Transforms/Utils/GlobalStatus.h (+3-3)
  • (modified) llvm/lib/Transforms/IPO/GlobalOpt.cpp (+7)
  • (modified) llvm/lib/Transforms/Utils/GlobalStatus.cpp (+14-3)
  • (modified) llvm/test/Transforms/GlobalOpt/basictest.ll (+17-3)
  • (modified) llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll (+3-2)
  • (modified) llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll (+4-2)
  • (modified) llvm/test/Transforms/GlobalOpt/tls.ll (+8-4)
diff --git a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h index 60c91fc30174de..c001e587313c56 100644 --- a/llvm/include/llvm/Transforms/Utils/GlobalStatus.h +++ b/llvm/include/llvm/Transforms/Utils/GlobalStatus.h @@ -24,9 +24,9 @@ class Value; /// bool isSafeToDestroyConstant(const Constant *C); -/// As we analyze each global, keep track of some information about it. If we -/// find out that the address of the global is taken, none of this info will be -/// accurate. +/// As we analyze each global or thread-local variable, keep track of some +/// information about it. If we find out that the address of the global is +/// taken, none of this info will be accurate. struct GlobalStatus { /// True if the global's address is used in a comparison. bool IsCompared = false; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index da714c9a75701b..fbb83e787f6327 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -306,6 +306,10 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, APInt Offset(DL.getIndexTypeSizeInBits(PtrOp->getType()), 0); PtrOp = PtrOp->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(PtrOp)) { + if (II->getIntrinsicID() == Intrinsic::threadlocal_address) + PtrOp = II->getArgOperand(0); + } if (PtrOp == GV) { if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) { LI->replaceAllUsesWith(Value); @@ -318,6 +322,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv if (getUnderlyingObject(MI->getRawDest()) == GV) EraseFromParent(MI); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { + if (II->getIntrinsicID() == Intrinsic::threadlocal_address) + append_range(WorkList, II->users()); } } diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp index c5aded3c45f4c7..ba6dba8db954b7 100644 --- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp +++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp @@ -172,9 +172,20 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS, return true; GS.StoredType = GlobalStatus::Stored; } else if (const auto *CB = dyn_cast<CallBase>(I)) { - if (!CB->isCallee(&U)) - return true; - GS.IsLoaded = true; + bool KnownIntrinsic = false; + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::threadlocal_address && + II->getArgOperand(0) == V) { + if (analyzeGlobalAux(I, GS, VisitedUsers)) + return true; + KnownIntrinsic = true; + } + } + if (!KnownIntrinsic) { + if (!CB->isCallee(&U)) + return true; + GS.IsLoaded = true; + } } else { return true; // Any other non-load instruction might take address! } diff --git a/llvm/test/Transforms/GlobalOpt/basictest.ll b/llvm/test/Transforms/GlobalOpt/basictest.ll index 6d7fcdd96dfda7..72d38a1e88457e 100644 --- a/llvm/test/Transforms/GlobalOpt/basictest.ll +++ b/llvm/test/Transforms/GlobalOpt/basictest.ll @@ -1,9 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=globalopt -S | FileCheck %s -; CHECK-NOT: global @X = internal global i32 4 ; <ptr> [#uses=1] define i32 @foo() { - %V = load i32, ptr @X ; <i32> [#uses=1] - ret i32 %V +; CHECK-LABEL: define i32 @foo() local_unnamed_addr { +; CHECK-NEXT: ret i32 4 +; + %V = load i32, ptr @X ; <i32> [#uses=1] + ret i32 %V +} + +@X_tls = internal thread_local global i32 13 + +define i32 @bar() { +; CHECK-LABEL: define i32 @bar() local_unnamed_addr { +; CHECK-NEXT: ret i32 13 +; + %p = call ptr @llvm.threadlocal.address(ptr @X_tls) + %v = load i32, ptr %p + ret i32 %v } diff --git a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll index ca844f63937ca3..f82942e73d92f7 100644 --- a/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll +++ b/llvm/test/Transforms/GlobalOpt/constantfold-initializers.ll @@ -72,11 +72,12 @@ entry: } @threadlocalptr = global ptr null, align 4 -; CHECK: @threadlocalptr = global ptr null, align 4 +; CHECK: @threadlocalptr = local_unnamed_addr global ptr null, align 4 @threadlocalvar = external thread_local global i32 define internal void @test5() { entry: - store ptr @threadlocalvar, ptr @threadlocalptr, align 4 + %p = call ptr @llvm.threadlocal.address(ptr @threadlocalvar) + store ptr %p, ptr @threadlocalptr, align 4 ret void } diff --git a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll index 7b845070bbd0b3..2b7ceb4169f350 100644 --- a/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll +++ b/llvm/test/Transforms/GlobalOpt/stored-once-forward-value.ll @@ -39,12 +39,14 @@ define i32 @dom_arg(i32 %a) { define ptr @dom_thread_local_global() { ; CHECK-LABEL: @dom_thread_local_global( -; CHECK-NEXT: store ptr @tl, ptr @g3, align 8 +; CHECK-NEXT: [[P:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tl) +; CHECK-NEXT: store ptr [[P]], ptr @g3, align 8 ; CHECK-NEXT: call void @b() ; CHECK-NEXT: [[R:%.*]] = load ptr, ptr @g3, align 8 ; CHECK-NEXT: ret ptr [[R]] ; - store ptr @tl, ptr @g3 + %p = call ptr @llvm.threadlocal.address(ptr @tl) + store ptr %p, ptr @g3 call void @b() %r = load ptr, ptr @g3 ret ptr %r diff --git a/llvm/test/Transforms/GlobalOpt/tls.ll b/llvm/test/Transforms/GlobalOpt/tls.ll index 6ba003ff30b2ea..2cc2ea4e366e34 100644 --- a/llvm/test/Transforms/GlobalOpt/tls.ll +++ b/llvm/test/Transforms/GlobalOpt/tls.ll @@ -15,14 +15,16 @@ declare void @start_thread(ptr) define i32 @f() { entry: ; Set @ip to point to x[1] for thread 1. - store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8 + %p = call ptr @llvm.threadlocal.address(ptr @x) + %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1 + store ptr %addr, ptr @ip, align 8 ; Run g on a new thread. tail call void @start_thread(ptr @g) nounwind tail call void @wait() nounwind ; Reset x[1] for thread 1. - store i32 0, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4 + store i32 0, ptr %addr, align 4 ; Read the value of @ip, which now points at x[1] for thread 2. %0 = load ptr, ptr @ip, align 8 @@ -39,10 +41,12 @@ entry: define internal void @g() nounwind uwtable { entry: ; Set @ip to point to x[1] for thread 2. - store ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), ptr @ip, align 8 + %p = call ptr @llvm.threadlocal.address(ptr @x) + %addr = getelementptr inbounds [100 x i32], ptr %p, i64 0, i64 1 + store ptr %addr, ptr @ip, align 8 ; Store 50 in x[1] for thread 2. - store i32 50, ptr getelementptr inbounds ([100 x i32], ptr @x, i64 0, i64 1), align 4 + store i32 50, ptr %addr, align 4 tail call void @signal() nounwind ret void 
@MatzeB
Copy link
Contributor Author

MatzeB commented Apr 17, 2024

For the record: I believe there is no additional problems with coroutines (functions changing thread-id) with this optimization given GlobalOpt computes summaries for the whole module independent of control flow anyway.

@MatzeB MatzeB requested a review from efriedma-quic April 23, 2024 16:46
@MatzeB
Copy link
Contributor Author

MatzeB commented Apr 23, 2024

ping.

One more patch in my series about about threadlocal.address; while I don't think this has any actual performance implications for our use cases, it seemed easy enough to fix while I was looking at things and it addresses an outstanding github issue.

Copy link
Collaborator

@efriedma-quic efriedma-quic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@MatzeB
Copy link
Contributor Author

MatzeB commented Apr 29, 2024

(Ignoring unrelated "TextDiagnosticPrinter.h(23): fatal error C1060: compiler is out of heap space" errors from windows build)

@MatzeB MatzeB merged commit dede19c into llvm:main Apr 29, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

4 participants