[BOLT] Move call probe information to CallSiteInfo #165490
base: users/aaupov/spr/main.bolt-move-call-probe-information-to-callsiteinfo
Conversation
Created using spr 1.3.4
@llvm/pr-subscribers-bolt

Author: Amir Ayupov (aaupov)

Changes

Pseudo probe matching (#100446) needs callee information for call probes.
Probe information can be further compressed (to be added in a follow-up).
The size increase is modest, at ~3% for an XL profile (461->475MB).

Test Plan: updated pseudoprobe-decoding-{inline,noinline}.test

Full diff: https://github.com/llvm/llvm-project/pull/165490.diff

7 Files Affected:
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index 41e2bd1651efd..b393c85321b7d 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -29,6 +29,10 @@ struct CallSiteInfo {
   uint32_t EntryDiscriminator{0}; /// multiple entry discriminator
   uint64_t Count{0};
   uint64_t Mispreds{0};
+  // Pseudo probe information, optional
+  uint32_t Probe{0};
+  bool Indirect = false;
+  uint32_t InlineTreeNode{0};
 
   bool operator==(const CallSiteInfo &Other) const {
     return Offset == Other.Offset && DestId == Other.DestId &&
@@ -63,6 +67,9 @@ template <> struct MappingTraits<bolt::CallSiteInfo> {
     YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0);
     YamlIO.mapRequired("cnt", CSI.Count);
     YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0);
+    YamlIO.mapOptional("pp", CSI.Probe, 0);
+    YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0);
+    YamlIO.mapOptional("ind", CSI.Indirect, false);
   }
 
   static const bool flow = true;
@@ -95,29 +102,20 @@ template <> struct MappingTraits<bolt::SuccessorInfo> {
 namespace bolt {
 struct PseudoProbeInfo {
-  uint32_t InlineTreeIndex = 0;
-  uint64_t BlockMask = 0; // bitset with probe indices from 1 to 64
-  std::vector<uint64_t> BlockProbes; // block probes with indices above 64
-  std::vector<uint64_t> CallProbes;
-  std::vector<uint64_t> IndCallProbes;
+  std::vector<uint64_t> BlockProbes;
   std::vector<uint32_t> InlineTreeNodes;
 
   bool operator==(const PseudoProbeInfo &Other) const {
-    return InlineTreeIndex == Other.InlineTreeIndex &&
-           BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes &&
-           IndCallProbes == Other.IndCallProbes;
+    return InlineTreeNodes == Other.InlineTreeNodes &&
+           BlockProbes == Other.BlockProbes;
   }
 };
 } // end namespace bolt
 
 template <> struct MappingTraits<bolt::PseudoProbeInfo> {
   static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) {
-    YamlIO.mapOptional("blx", PI.BlockMask, 0);
-    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("call", PI.CallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("id", PI.InlineTreeIndex, 0);
-    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>());
+    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>(1, 1));
+    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>(1, 0));
   }
 
   static const bool flow = true;
diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h
index d4d7217464cc8..50ee78d342df8 100644
--- a/bolt/include/bolt/Profile/YAMLProfileWriter.h
+++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h
@@ -74,25 +74,24 @@ class YAMLProfileWriter {
   collectInlineTree(const MCPseudoProbeDecoder &Decoder,
                     const MCDecodedPseudoProbeInlineTree &Root);
 
-  // 0 - block probe, 1 - indirect call, 2 - direct call
-  using ProbeList = std::array<SmallVector<uint64_t, 0>, 3>;
-  using NodeIdToProbes = DenseMap<uint32_t, ProbeList>;
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  convertNodeProbes(NodeIdToProbes &NodeProbes);
-
 public:
-  template <typename T>
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) {
-    NodeIdToProbes NodeProbes;
-    for (const MCDecodedPseudoProbe &Probe : Probes) {
-      auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode());
-      if (It == InlineTreeNodeId.end())
-        continue;
-      NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex());
-    }
-    return convertNodeProbes(NodeProbes);
-  }
+  class BlockProbeCtx {
+    struct Call {
+      uint64_t Id;
+      uint32_t Node;
+      bool Indirect;
+      bool Used;
+    };
+    // Group block probes by node id.
+    DenseMap<uint32_t, std::vector<uint64_t>> NodeToProbes;
+    // Offset -> call probe
+    DenseMap<uint32_t, Call> CallProbes;
+
+  public:
+    void addBlockProbe(const InlineTreeMapTy &Map,
+                       const MCDecodedPseudoProbe &Probe, uint32_t ProbeOffset);
+    void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB);
+  };
 };
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 3604fdd3a94b4..6a12efdcb96c0 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2383,10 +2383,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
           PseudoProbeDecoder->getAddress2ProbesMap();
       BinaryFunction::FragmentsSetTy Fragments(BF->Fragments);
       Fragments.insert(BF);
-      DenseMap<
-          uint32_t,
-          std::vector<std::reference_wrapper<const MCDecodedPseudoProbe>>>
-          BlockProbes;
+      DenseMap<uint32_t, YAMLProfileWriter::BlockProbeCtx> BlockCtx;
       for (const BinaryFunction *F : Fragments) {
         const uint64_t FuncAddr = F->getAddress();
         for (const MCDecodedPseudoProbe &Probe :
@@ -2394,15 +2391,14 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
           const uint32_t OutputAddress = Probe.getAddress();
           const uint32_t InputOffset = BAT->translate(
               FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
-          const unsigned BlockIndex = getBlock(InputOffset).second;
-          BlockProbes[BlockIndex].emplace_back(Probe);
+          const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset);
+          BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe,
+                                             InputOffset - BlockOffset);
         }
       }
 
-      for (auto &[Block, Probes] : BlockProbes) {
-        YamlBF.Blocks[Block].PseudoProbes =
-            YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId);
-      }
+      for (auto &[Block, Ctx] : BlockCtx)
+        Ctx.finalize(YamlBF.Blocks[Block]);
     }
 
     // Skip printing if there's no profile data
     llvm::erase_if(
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 1a61949d77472..5fb65153cf313 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -348,26 +348,10 @@ class StaleMatcher {
       return It->second;
     };
 
-    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
-                                        &ProfileProbe,
-                                    uint32_t NodeId) {
-      for (uint64_t Index = 0; Index < 64; ++Index)
-        if (ProfileProbe.BlockMask & 1ull << Index)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
-      for (const auto &ProfileProbes :
-           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
-            ProfileProbe.CallProbes})
-        for (uint64_t ProfileProbe : ProfileProbes)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, ProfileProbe)];
-    };
-
-    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
-      if (!ProfileProbe.InlineTreeNodes.empty())
-        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
-          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
-      else
-        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
-    }
+    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes)
+      for (uint32_t Node : ProfileProbe.InlineTreeNodes)
+        for (uint64_t Probe : ProfileProbe.BlockProbes)
+          ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)];
     uint32_t BestMatchCount = 0;
     uint32_t TotalMatchCount = 0;
     const FlowBlock *BestMatchBlock = nullptr;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 1632aa1c6bfe2..82f2114a7a3d2 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const MCPseudoProbeDecoder &Decoder) {
   return {Desc, InlineTree};
 }
 
-std::vector<yaml::bolt::PseudoProbeInfo>
-YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) {
-  struct BlockProbeInfoHasher {
-    size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const {
-      return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes),
-                                llvm::hash_combine_range(BPI.CallProbes),
-                                llvm::hash_combine_range(BPI.IndCallProbes));
+void YAMLProfileWriter::BlockProbeCtx::addBlockProbe(
+    const InlineTreeMapTy &Map, const MCDecodedPseudoProbe &Probe,
+    uint32_t ProbeOffset) {
+  auto It = Map.find(Probe.getInlineTreeNode());
+  if (It == Map.end())
+    return;
+  auto NodeId = It->second;
+  uint32_t Index = Probe.getIndex();
+  if (Probe.isCall())
+    CallProbes[ProbeOffset] =
+        Call{Index, NodeId, Probe.isIndirectCall(), false};
+  else
+    NodeToProbes[NodeId].emplace_back(Index);
+}
+
+void YAMLProfileWriter::BlockProbeCtx::finalize(
+    yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
+  // Hash block probes by vector
+  struct ProbeHasher {
+    size_t operator()(const ArrayRef<uint64_t> Probes) const {
+      return llvm::hash_combine_range(Probes);
     }
   };
 
-  // Check identical BlockProbeInfo structs and merge them
-  std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>,
-                     BlockProbeInfoHasher>
-      BPIToNodes;
-  for (auto &[NodeId, Probes] : NodeProbes) {
-    yaml::bolt::PseudoProbeInfo BPI;
-    BPI.BlockProbes = std::vector(Probes[0].begin(), Probes[0].end());
-    BPI.IndCallProbes = std::vector(Probes[1].begin(), Probes[1].end());
-    BPI.CallProbes = std::vector(Probes[2].begin(), Probes[2].end());
-    BPIToNodes[BPI].push_back(NodeId);
+  // Check identical block probes and merge them
+  std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher>
+      ProbesToNodes;
+  for (auto &[NodeId, Probes] : NodeToProbes) {
+    llvm::sort(Probes);
+    ProbesToNodes[Probes].emplace_back(NodeId);
   }
-
-  auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) {
-    for (auto Id : Ids)
-      if (Id > 64)
-        Vec.emplace_back(Id);
-      else
-        Mask |= 1ull << (Id - 1);
-  };
-
-  // Add to YAML with merged nodes/block mask optimizations
-  std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes;
-  YamlProbes.reserve(BPIToNodes.size());
-  for (const auto &[BPI, Nodes] : BPIToNodes) {
-    auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo());
-    YamlBPI.CallProbes = BPI.CallProbes;
-    YamlBPI.IndCallProbes = BPI.IndCallProbes;
-    if (Nodes.size() == 1)
-      YamlBPI.InlineTreeIndex = Nodes.front();
-    else
-      YamlBPI.InlineTreeNodes = Nodes;
-    handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask);
+  for (auto &[Probes, Nodes] : ProbesToNodes) {
+    llvm::sort(Nodes);
+    YamlBB.PseudoProbes.emplace_back(
+        yaml::bolt::PseudoProbeInfo{Probes, Nodes});
+  }
+  for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) {
+    auto It = CallProbes.find(CSI.Offset);
+    if (It == CallProbes.end())
+      continue;
+    Call &Probe = It->second;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    Probe.Used = true;
+  }
+  for (const auto &[Offset, Probe] : CallProbes) {
+    if (Probe.Used)
+      continue;
+    yaml::bolt::CallSiteInfo CSI;
+    CSI.Offset = Offset;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    YamlBB.CallSites.emplace_back(CSI);
   }
-  return YamlProbes;
 }
 
 std::tuple<std::vector<yaml::bolt::InlineTreeNode>,
@@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
         const AddressProbesMap &ProbeMap =
             PseudoProbeDecoder->getAddress2ProbesMap();
         const uint64_t FuncAddr = BF.getAddress();
-        const std::pair<uint64_t, uint64_t> &BlockRange =
-            BB->getInputAddressRange();
-        const std::pair<uint64_t, uint64_t> BlockAddrRange = {
-            FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
-        auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second);
-        YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId);
+        auto [Start, End] = BB->getInputAddressRange();
+        Start += FuncAddr;
+        End += FuncAddr;
+        BlockProbeCtx Ctx;
+        for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End))
+          Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start);
+        Ctx.finalize(YamlBB);
       }
 
       YamlBF.Blocks.emplace_back(YamlBB);
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test
index e5e8aadc18f9e..9748fc1b6a4d4 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -14,17 +14,17 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 9 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ]
 # CHECK-YAML: inline_tree: [ { } ]
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 3 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ]
 # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ]
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 3, id: 1 }, { blx: 1 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { } ]
 # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ]
 #
 # CHECK-YAML: pseudo_probe_desc:
diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test b/bolt/test/X86/pseudoprobe-decoding-noinline.test
index 36a2fab74e857..4ba51cdc96f9e 100644
--- a/bolt/test/X86/pseudoprobe-decoding-noinline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test
@@ -15,17 +15,18 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 9 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 4 ] } ]
 # CHECK-YAML: inline_tree: [ { } ]
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 3 } ]
+# CHECK-YAML: probes: [ { blk: [ 1, 2 ] } ]
 # CHECK-YAML: inline_tree: [ { g: 2 } ]
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML: probes: [ { blx: 1, call: [ 2 ] } ]
+# CHECK-YAML: calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ]
+# CHECK-YAML: probes: [ { } ]
 # CHECK-YAML: inline_tree: [ { g: 1 } ]
 #
 # CHECK-YAML: pseudo_probe_desc:
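As a reading aid for the BlockProbeCtx::finalize change above, here is a minimal standalone sketch (not BOLT code; the sample input and the printing are illustrative) of the merging step: inline tree nodes that end up with an identical sorted list of block probe ids are folded into a single { blk: [...], ids: [...] } record.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Hypothetical input: inline tree node id -> block probe ids observed in
  // one basic block (the PR collects this in BlockProbeCtx::NodeToProbes).
  std::map<uint32_t, std::vector<uint64_t>> NodeToProbes = {
      {1, {2, 1}}, {3, {1, 2}}, {4, {5}}};

  // Invert the map so nodes with identical sorted probe vectors merge.
  // std::map accepts vector keys directly; the PR instead uses
  // std::unordered_map with a hasher built on llvm::hash_combine_range.
  std::map<std::vector<uint64_t>, std::vector<uint32_t>> ProbesToNodes;
  for (auto &[Node, Probes] : NodeToProbes) {
    std::sort(Probes.begin(), Probes.end());
    ProbesToNodes[Probes].push_back(Node);
  }

  // Prints: blk: [ 1 2 ], ids: [ 1 3 ]  then  blk: [ 5 ], ids: [ 4 ]
  for (const auto &[Probes, Nodes] : ProbesToNodes) {
    printf("blk: [");
    for (uint64_t P : Probes)
      printf(" %llu", (unsigned long long)P);
    printf(" ], ids: [");
    for (uint32_t N : Nodes)
      printf(" %u", N);
    printf(" ]\n");
  }
  return 0;
}
```

The merging is what keeps the YAML compact when many inlined instances of the same function share one probe layout.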
Pseudo probe matching (#100446) needs callee information for call probes.
Embed call probe information (probe id, inline tree node, indirect flag)
into CallSiteInfo (see the sketch after this list). As a consequence:
- PseudoProbeInfo is simplified: it only contains block probes.
- The default `blk` value becomes [ 1 ], making it possible
  to unambiguously elide block id 1 (the common case).
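Here is a minimal standalone sketch of that attachment, simplified from the PR's BlockProbeCtx::finalize. The CallSiteInfo and Call structs below are trimmed stand-ins rather than the real BOLT types, and the offsets/counts are made-up sample data: call probes are keyed by block-relative offset, annotate a matching call site in place, and probes with no sampled call site get a synthesized zero-count entry.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Trimmed stand-in for yaml::bolt::CallSiteInfo with the new fields.
struct CallSiteInfo {
  uint32_t Offset = 0;
  uint64_t Count = 0;
  uint32_t Probe = 0;          // "pp": call probe id
  uint32_t InlineTreeNode = 0; // "ppn": owning inline tree node
  bool Indirect = false;       // "ind": indirect call flag
};

// Call probe record, as collected by addBlockProbe in the PR.
struct Call {
  uint32_t Id;
  uint32_t Node;
  bool Indirect;
  bool Used = false;
};

int main() {
  // One sampled call site at offset 4; call probes at offsets 4 and 10.
  std::vector<CallSiteInfo> CallSites = {{/*Offset=*/4, /*Count=*/100}};
  std::map<uint32_t, Call> CallProbes = {{4, {2, 1, false}},
                                         {10, {3, 1, true}}};

  // Pass 1: annotate existing call sites whose offset has a call probe.
  for (CallSiteInfo &CSI : CallSites) {
    auto It = CallProbes.find(CSI.Offset);
    if (It == CallProbes.end())
      continue;
    Call &Probe = It->second;
    CSI.Probe = Probe.Id;
    CSI.InlineTreeNode = Probe.Node;
    CSI.Indirect = Probe.Indirect;
    Probe.Used = true;
  }

  // Pass 2: synthesize zero-count call sites for unmatched call probes so
  // the callee link still reaches the profile.
  for (const auto &[Offset, Probe] : CallProbes) {
    if (Probe.Used)
      continue;
    CallSiteInfo CSI;
    CSI.Offset = Offset;
    CSI.Probe = Probe.Id;
    CSI.InlineTreeNode = Probe.Node;
    CSI.Indirect = Probe.Indirect;
    CallSites.push_back(CSI);
  }

  // Prints: off 4 cnt 100 pp 2 ppn 1 ind 0 / off 10 cnt 0 pp 3 ppn 1 ind 1
  for (const CallSiteInfo &CSI : CallSites)
    printf("off %u cnt %llu pp %u ppn %u ind %d\n", CSI.Offset,
           (unsigned long long)CSI.Count, CSI.Probe, CSI.InlineTreeNode,
           (int)CSI.Indirect);
  return 0;
}
```

The zero-count entries mirror the updated noinline test, which expects calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ], presumably so probe matching sees the callee link even when the call recorded no samples.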
Block mask (blx) encoding becomes a low-ROI optimization and will be
replaced by a more compact encoding leveraging simplified PseudoProbeInfo
in a follow-up change.
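For contrast, the retired blx mask packed block probe ids 1 through 64 as bits, with bit N set meaning probe id N+1 is present; that is why the updated tests spell the old probes: [ { blx: 9 } ] as probes: [ { blk: [ 1, 4 ] } ]. A minimal decoding sketch, with the mask value taken from that test line:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // blx: 9 == 0b1001 -> bits 0 and 3 set -> block probe ids 1 and 4.
  uint64_t BlockMask = 9;
  for (uint64_t Index = 0; Index < 64; ++Index)
    if (BlockMask & (1ull << Index))
      printf("block probe id %llu\n", (unsigned long long)(Index + 1));
  return 0;
}
```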
The size increase is ~3% for an XL profile (461->475MB). Compact block
probe encoding shrinks it by ~6%.
Test Plan: updated pseudoprobe-decoding-{inline,noinline}.test