Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions llvm/include/llvm/ProfileData/SampleProf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1214,13 +1214,19 @@ class FunctionSamples {
// Note the sequence of the suffixes in the knownSuffixes array matters.
// If suffix "A" is appended after the suffix "B", "A" should be in front
// of "B" in knownSuffixes.
const char *KnownSuffixes[] = {LLVMSuffix, PartSuffix, UniqSuffix};
const SmallVector<StringRef> KnownSuffixes{LLVMSuffix, PartSuffix,
UniqSuffix};
return getCanonicalFnName(FnName, KnownSuffixes, Attr);
}

static StringRef getCanonicalFnName(StringRef FnName,
ArrayRef<StringRef> Suffixes,
StringRef Attr = "selected") {
if (Attr == "" || Attr == "all")
return FnName.split('.').first;
if (Attr == "selected") {
StringRef Cand(FnName);
for (const auto &Suf : KnownSuffixes) {
StringRef Suffix(Suf);
for (const auto Suffix : Suffixes) {
// If the profile contains ".__uniq." suffix, don't strip the
// suffix for names in the IR.
if (Suffix == UniqSuffix && FunctionSamples::HasUniqSuffix)
Expand All @@ -1229,7 +1235,7 @@ class FunctionSamples {
if (It == StringRef::npos)
continue;
auto Dit = Cand.rfind('.');
if (Dit == It + Suffix.size() - 1)
if (Dit == It || Dit == It + Suffix.size() - 1)
Cand = Cand.substr(0, It);
}
return Cand;
Expand Down
Binary file not shown.
37 changes: 37 additions & 0 deletions llvm/test/tools/llvm-profgen/missing-dwarf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; RUN: rm -rf %t
; RUN: mkdir -p %t
; RUN: cd %t

; RUN: echo -e "1\n401120-40113b:1\n1\n40112f->401110:1" > %t.prof

; Test --load-function-from-symbol=0
; RUN: llvm-profgen --format=text --unsymbolized-profile=%t.prof --binary=%S/Inputs/missing-dwarf.exe --output=%t1 --fill-zero-for-all-funcs --show-detailed-warning --use-offset=0 --load-function-from-symbol=0 2>&1 | FileCheck %s --check-prefix=CHECK-NO-LOAD-SYMTAB

; CHECK-NO-LOAD-SYMTAB: warning: Loading of DWARF info completed, but no binary functions have been retrieved.

; Test --load-function-from-symbol=1
; RUN: llvm-profgen --format=text --unsymbolized-profile=%t.prof --binary=%S/Inputs/missing-dwarf.exe --output=%t2 --fill-zero-for-all-funcs --show-detailed-warning --use-offset=0 --load-function-from-symbol=1
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-LOAD-SYMTAB

; CHECK-LOAD-SYMTAB: main:2:1
; CHECK-LOAD-SYMTAB-NEXT: 1: 1
; CHECK-LOAD-SYMTAB-NEXT: 2: 1 foo:1
; CHECK-LOAD-SYMTAB-NEXT: !CFGChecksum: 281479271677951
; CHECK-LOAD-SYMTAB-NEXT: foo:0:0
; CHECK-LOAD-SYMTAB-NEXT: 1: 0
; CHECK-LOAD-SYMTAB-NEXT: !CFGChecksum: 4294967295

; Build instructions:
; missing-dwarf.o: clang -gsplit-dwarf=split -fdebug-compilation-dir=. test.c -fdebug-info-for-profiling -fpseudo-probe-for-profiling -O0 -g -o missing-dwarf.o -c
; missing-dwarf.exe: clang -fdebug-compilation-dir=. missing-dwarf.o -o missing-dwarf.exe -fdebug-info-for-profiling -fpseudo-probe-for-profiling -O0 -g

; Source code:

int foo() {
return 1;
}

int main() {
foo();
return 0;
}
7 changes: 7 additions & 0 deletions llvm/tools/llvm-profgen/PerfReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1284,6 +1284,7 @@ void PerfScriptReader::warnInvalidRange() {
uint64_t TotalRangeNum = 0;
uint64_t InstNotBoundary = 0;
uint64_t UnmatchedRange = 0;
uint64_t RecoveredRange = 0;
uint64_t RangeCrossFunc = 0;
uint64_t BogusRange = 0;

Expand All @@ -1309,6 +1310,9 @@ void PerfScriptReader::warnInvalidRange() {
continue;
}

if (FRange->Func->FromSymtab)
RecoveredRange += I.second;

if (EndAddress >= FRange->EndAddress) {
RangeCrossFunc += I.second;
WarnInvalidRange(StartAddress, EndAddress, RangeCrossFuncMsg);
Expand All @@ -1328,6 +1332,9 @@ void PerfScriptReader::warnInvalidRange() {
emitWarningSummary(
UnmatchedRange, TotalRangeNum,
"of samples are from ranges that do not belong to any functions.");
emitWarningSummary(RecoveredRange, TotalRangeNum,
"of samples are from ranges that belong to functions "
"recovered from symbol table.");
emitWarningSummary(
RangeCrossFunc, TotalRangeNum,
"of samples are from ranges that do cross function boundaries.");
Expand Down
73 changes: 71 additions & 2 deletions llvm/tools/llvm-profgen/ProfiledBinary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ static cl::list<std::string> DisassembleFunctions(
"names only. Only work with show-disassembly-only"),
cl::cat(ProfGenCategory));

static cl::opt<bool>
LoadFunctionFromSymbol("load-function-from-symbol", cl::init(true),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wlei-llvm somehow i remember we had something like that before (long time ago)? did we remove the symbol table loading code in the past?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, iirc, we initially only loaded the function ranges from symbol table, then after we found the issue with the multi-version function(foo.cold, foo.resume...), we redesigned and fully switched to load the symbol from dwarf.

At that time, the code was implemented inside the dissassembleSymbol(which also iterates the symbols),
https://github.com/llvm/llvm-project/blob/main/llvm/tools/llvm-profgen/ProfiledBinary.cpp#L529

Copy link
Contributor

@wlei-llvm wlei-llvm Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference, this is the patch https://reviews.llvm.org/D112282 we did the switch to use dwarf symbol.

cl::desc("Gather additional binary function info "
"from symbols (e.g. .symtab) in case "
"dwarf info is incomplete."),
cl::cat(ProfGenCategory));

static cl::opt<bool>
KernelBinary("kernel",
cl::desc("Generate the profile for Linux kernel binary."),
Expand Down Expand Up @@ -257,6 +264,9 @@ void ProfiledBinary::load() {
if (ShowDisassemblyOnly)
decodePseudoProbe(Obj);

if (LoadFunctionFromSymbol && UsePseudoProbes)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is pseudo-probe required here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohh, this is to be compatible with the code here:

if (Binary->usePseudoProbes()) {
generateProbeBasedProfile();
} else {
generateLineNumBasedProfile();
}

UsePseudoProbes is needed so we won't use line numbers to generate profile, which won't work with symtab because symtab only have addresses

loadSymbolsFromSymtab(Obj);

// Disassemble the text sections.
disassemble(Obj);

Expand Down Expand Up @@ -604,13 +614,13 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
// Record potential call targets for tail frame inference later-on.
if (InferMissingFrames && FRange) {
uint64_t Target = 0;
MIA->evaluateBranch(Inst, Address, Size, Target);
bool Err = MIA->evaluateBranch(Inst, Address, Size, Target);
if (MCDesc.isCall()) {
// Indirect call targets are unknown at this point. Recording the
// unknown target (zero) for further LBR-based refinement.
MissingContextInferrer->CallEdges[Address].insert(Target);
} else if (MCDesc.isUnconditionalBranch()) {
assert(Target &&
assert(Err &&
"target should be known for unconditional direct branch");
// Any inter-function unconditional jump is considered tail call at
// this point. This is not 100% accurate and could further be
Expand Down Expand Up @@ -820,6 +830,65 @@ void ProfiledBinary::populateSymbolAddressList(const ObjectFile *Obj) {
}
}

void ProfiledBinary::loadSymbolsFromSymtab(const ObjectFile *Obj) {
// Load binary functions from symbol table when Debug info is incomplete.
// Strip the internal suffixes which are not reflected in the DWARF info.
const SmallVector<StringRef, 10> Suffixes(
{// Internal suffixes from CoroSplit pass
".cleanup", ".destroy", ".resume",
// Internal suffixes from Bolt
".cold", ".warm",
// Compiler/LTO internal
".llvm.", ".part.", ".isra.", ".constprop.", ".lto_priv."});
StringRef FileName = Obj->getFileName();
for (const SymbolRef &Symbol : Obj->symbols()) {
const SymbolRef::Type Type = unwrapOrError(Symbol.getType(), FileName);
const uint64_t StartAddr = unwrapOrError(Symbol.getAddress(), FileName);
const StringRef Name = unwrapOrError(Symbol.getName(), FileName);
uint64_t Size = 0;
if (isa<ELFObjectFileBase>(Symbol.getObject())) {
ELFSymbolRef ElfSymbol(Symbol);
Size = ElfSymbol.getSize();
}

if (Size == 0 || Type != SymbolRef::ST_Function)
continue;

const StringRef SymName =
FunctionSamples::getCanonicalFnName(Name, Suffixes);

auto Range = findFuncRange(StartAddr);
if (!Range || Range->StartAddress != StartAddr) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the cause for a range from symbol table to be found in dwarf, but with different start address?

Isn't that also an "inconsistent" case that we should warn about as part of the "else"?

So basically, if range for StartAddr is found, we expect the dwarf range and symbol table range to be identical, right?

Even if this is a funclet range (e.g. coroutine.resume), the range should still be identical from the one found in dwarf?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, updated the logic to something like this:

if (!Range) { // Function from symbol table not found previously in DWARF, store ranges. } else if (SymName != Range->getFuncName()) { // Function range already found from DWARF, but the symbol name from symbol table is inconsistent with debug info. assert(StartAddr == Range->StartAddress && EndAddr == Range->EndAddress); // Update the function name... } else if (if (StartAddr != Range->StartAddress && EndAddr != Range->EndAddress) { // Function already found in DWARF, but the address range from symbol table conflicts/overlaps with the debug info. llvm_unreachable(); }

So far I only see symbol name conflicts, and all the address ranges matches between symbol table and dwarf..

// Function from symbol table not found previously in DWARF, store ranges.
auto Ret = BinaryFunctions.emplace(SymName, BinaryFunction());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a range is found, we should expect function to be found. probably want to assert(!Range || !Ret.second).

auto &Func = Ret.first->second;
if (Ret.second) {
Func.FuncName = Ret.first->first;
HashBinaryFunctions[MD5Hash(StringRef(SymName))] = &Func;
}

Func.FromSymtab = true;
Func.Ranges.emplace_back(StartAddr, StartAddr + Size);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if a range is found but start address differ, we create a new range, but isn't this creating an overlapping range with existing one (from dwarf)? We should never have two ranges that covers the same address (i.e. ranges must not overlap).


auto R = StartAddrToFuncRangeMap.emplace(StartAddr, FuncRange());
FuncRange &FRange = R.first->second;

FRange.Func = &Func;
FRange.StartAddress = StartAddr;
FRange.EndAddress = StartAddr + Size;

} else if (SymName != Range->getFuncName() && ShowDetailedWarning) {
// Function already found from DWARF, check consistency between symbol
// table and DWARF.
WithColor::warning() << "Conflicting name for symbol" << Name
<< " at address " << format("%8" PRIx64, StartAddr)
<< ", but the DWARF symbol " << Range->getFuncName()
<< " indicates a starting address at "
<< format("%8" PRIx64, Range->StartAddress) << "\n";
}
}
}

void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) {
for (const auto &DieInfo : CompilationUnit.dies()) {
llvm::DWARFDie Die(&CompilationUnit, &DieInfo);
Expand Down
4 changes: 4 additions & 0 deletions llvm/tools/llvm-profgen/ProfiledBinary.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ struct BinaryFunction {
StringRef FuncName;
// End of range is an exclusive bound.
RangesTy Ranges;
bool FromSymtab = false;

uint64_t getFuncSize() {
uint64_t Sum = 0;
Expand Down Expand Up @@ -356,6 +357,9 @@ class ProfiledBinary {
// Create symbol to its start address mapping.
void populateSymbolAddressList(const object::ObjectFile *O);

// Load functions from its symbol table (when DWARF info is missing).
void loadSymbolsFromSymtab(const object::ObjectFile *O);

// A function may be spilt into multiple non-continuous address ranges. We use
// this to set whether start a function range is the real entry of the
// function and also set false to the non-function label.
Expand Down