Skip to content

Commit

Permalink
Improve reachability sets computation (#84204)
Browse files Browse the repository at this point in the history
* Improve reachability sets computation

Two changes to improve the throughput of computing `bbReach` sets:
1. In `fgComputeReachabilitySets()`, iterate over the blocks in reverse
post-order. This leads to fewer outer `do ... while(change)` iterations.
To do this, the `fgDfsReversePostorder()` function which creates the
reverse post-order numbers and ordering was hoisted out of dominator
creation and above reachability computation.
2. Create a `BlockSetOps::UnionDChanged()` function that does a `UnionD`
operation but also returns `true` if the target bitset changed value.

Some additional stats were added under `COUNT_BASIC_BLOCKS` regarding
how many iterations dominators and reachability computations take to
converge.

* Code review feedback
  • Loading branch information
BruceForstall committed Apr 1, 2023
1 parent c6adf4c commit e65ebc9
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 34 deletions.
7 changes: 7 additions & 0 deletions src/coreclr/jit/bitset.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ class BitSetOps

// Destructively modify "bs1" to be the union of "bs1" and "bs2".
static void UnionD(Env env, BitSetType& bs1, BitSetValueArgType bs2);
// Destructively modify "bs1" to be the union of "bs1" and "bs2"; return `true` if `bs1` changed.
static bool UnionDChanged(Env env, BitSetType& bs1, BitSetValueArgType bs2);
// Returns a new BitSet that is the union of "bs1" and "bs2".
static BitSetValueRetType Union(Env env, BitSetValueArgType bs1, BitSetValueArgType bs2);

Expand Down Expand Up @@ -374,6 +376,11 @@ class BitSetOpsWithCounter
BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_UnionD);
BSO::UnionD(env, bs1, bs2);
}
static bool UnionDChanged(Env env, BitSetType& bs1, BitSetValueArgType bs2)
{
BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_UnionDChanged);
return BSO::UnionDChanged(env, bs1, bs2);
}
static BitSetValueRetType Union(Env env, BitSetValueArgType bs1, BitSetValueArgType bs2)
{
BitSetTraits::GetOpCounter(env)->RecordOp(BitSetSupport::BSOP_Union);
Expand Down
41 changes: 40 additions & 1 deletion src/coreclr/jit/bitsetasshortlong.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class BitSetOps</*BitSetType*/ BitSetShortLongRep,
static unsigned CountLong(Env env, BitSetShortLongRep bs);
static bool IsEmptyUnionLong(Env env, BitSetShortLongRep bs1, BitSetShortLongRep bs2);
static void UnionDLong(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
static bool UnionDLongChanged(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
static void DiffDLong(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2);
static void AddElemDLong(Env env, BitSetShortLongRep& bs, unsigned i);
static bool TryAddElemDLong(Env env, BitSetShortLongRep& bs, unsigned i);
Expand Down Expand Up @@ -209,6 +210,23 @@ class BitSetOps</*BitSetType*/ BitSetShortLongRep,
UnionDLong(env, bs1, bs2);
}
}

static bool UnionDChanged(Env env, BitSetShortLongRep& bs1, BitSetShortLongRep bs2)
{
if (IsShort(env))
{
size_t bsCurrent = (size_t)bs1;
size_t bsNew = bsCurrent | ((size_t)bs2);
bool changed = bsNew != bsCurrent;
bs1 = (BitSetShortLongRep)bsNew;
return changed;
}
else
{
return UnionDLongChanged(env, bs1, bs2);
}
}

static BitSetShortLongRep Union(Env env, BitSetShortLongRep bs1, BitSetShortLongRep bs2)
{
BitSetShortLongRep res = MakeCopy(env, bs1);
Expand Down Expand Up @@ -443,7 +461,7 @@ class BitSetOps</*BitSetType*/ BitSetShortLongRep,
{
if (IsShort(env))
{
// Can't just shift by numBits+1, since that might be 32 (and (1 << 32( == 1, for an unsigned).
// Can't just shift by numBits+1, since that might be 32 (and (1 << 32) == 1, for an unsigned).
unsigned numBits = BitSetTraits::GetSize(env);
if (numBits == BitsInSizeT)
{
Expand Down Expand Up @@ -638,6 +656,27 @@ void BitSetOps</*BitSetType*/ BitSetShortLongRep,
}
}

template <typename Env, typename BitSetTraits>
bool BitSetOps</*BitSetType*/ BitSetShortLongRep,
/*Brand*/ BSShortLong,
/*Env*/ Env,
/*BitSetTraits*/ BitSetTraits>::UnionDLongChanged(Env env,
BitSetShortLongRep& bs1,
BitSetShortLongRep bs2)
{
assert(!IsShort(env));
bool changed = false;
unsigned len = BitSetTraits::GetArrSize(env);
for (unsigned i = 0; i < len; i++)
{
size_t bsCurrent = bs1[i];
size_t bsNew = bsCurrent | bs2[i];
changed |= bsNew != bsCurrent;
bs1[i] = bsNew;
}
return changed;
}

template <typename Env, typename BitSetTraits>
void BitSetOps</*BitSetType*/ BitSetShortLongRep,
/*Brand*/ BSShortLong,
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/bitsetops.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ BSOPNAME(BSOP_RemoveElem)
BSOPNAME(BSOP_AddElemD)
BSOPNAME(BSOP_AddElem)
BSOPNAME(BSOP_UnionD)
BSOPNAME(BSOP_UnionDChanged)
BSOPNAME(BSOP_Union)
BSOPNAME(BSOP_IntersectionD)
BSOPNAME(BSOP_Intersection)
Expand Down
28 changes: 28 additions & 0 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,15 @@ Histogram bbCntTable(bbCntBuckets);
unsigned bbSizeBuckets[] = {1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 0};
Histogram bbOneBBSizeTable(bbSizeBuckets);

unsigned domsChangedIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
Histogram domsChangedIterationTable(domsChangedIterationBuckets);

unsigned computeReachabilitySetsIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
Histogram computeReachabilitySetsIterationTable(computeReachabilitySetsIterationBuckets);

unsigned computeReachabilityIterationBuckets[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0};
Histogram computeReachabilityIterationTable(computeReachabilityIterationBuckets);

#endif // COUNT_BASIC_BLOCKS

/*****************************************************************************
Expand Down Expand Up @@ -1560,6 +1569,25 @@ void Compiler::compShutdown()
fprintf(fout, "--------------------------------------------------\n");
bbOneBBSizeTable.dump(fout);
fprintf(fout, "--------------------------------------------------\n");

fprintf(fout, "--------------------------------------------------\n");
fprintf(fout, "fgComputeDoms `while (change)` iterations:\n");
fprintf(fout, "--------------------------------------------------\n");
domsChangedIterationTable.dump(fout);
fprintf(fout, "--------------------------------------------------\n");

fprintf(fout, "--------------------------------------------------\n");
fprintf(fout, "fgComputeReachabilitySets `while (change)` iterations:\n");
fprintf(fout, "--------------------------------------------------\n");
computeReachabilitySetsIterationTable.dump(fout);
fprintf(fout, "--------------------------------------------------\n");

fprintf(fout, "--------------------------------------------------\n");
fprintf(fout, "fgComputeReachability `while (change)` iterations:\n");
fprintf(fout, "--------------------------------------------------\n");
computeReachabilityIterationTable.dump(fout);
fprintf(fout, "--------------------------------------------------\n");

#endif // COUNT_BASIC_BLOCKS

#if COUNT_LOOPS
Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -11691,6 +11691,9 @@ extern size_t gcPtrMapNSize;
#if COUNT_BASIC_BLOCKS
extern Histogram bbCntTable;
extern Histogram bbOneBBSizeTable;
extern Histogram domsChangedIterationTable;
extern Histogram computeReachabilitySetsIterationTable;
extern Histogram computeReachabilityIterationTable;
#endif

/*****************************************************************************
Expand Down
72 changes: 40 additions & 32 deletions src/coreclr/jit/fgopt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ void Compiler::fgUpdateChangedFlowGraph(FlowGraphUpdates updates)
JITDUMP("\nRenumbering the basic blocks for fgUpdateChangeFlowGraph\n");
fgRenumberBlocks();
fgComputeEnterBlocksSet();
fgDfsReversePostorder();
fgComputeReachabilitySets();
if (computeDoms)
{
Expand All @@ -219,18 +220,18 @@ void Compiler::fgUpdateChangedFlowGraph(FlowGraphUpdates updates)
//
// This also sets the BBF_GC_SAFE_POINT flag on blocks.
//
// This depends on `fgBBReversePostorder` being correct.
//
// TODO-Throughput: This algorithm consumes O(n^2) because we're using dense bitsets to
// represent reachability. While this yields O(1) time queries, it bloats the memory usage
// for large code. We can do better if we try to approach reachability by
// computing the strongly connected components of the flow graph. That way we only need
// linear memory to label every block with its SCC.
//
// Assumptions:
// Assumes the predecessor lists are correct.
//
void Compiler::fgComputeReachabilitySets()
{
assert(fgPredsComputed);
assert(fgBBReversePostorder != nullptr);

#ifdef DEBUG
fgReachabilitySetsValid = false;
Expand All @@ -250,21 +251,21 @@ void Compiler::fgComputeReachabilitySets()
// Find the reachable blocks. Also, set BBF_GC_SAFE_POINT.

bool change;
BlockSet newReach(BlockSetOps::MakeEmpty(this));
unsigned changedIterCount = 1;
do
{
change = false;

for (BasicBlock* const block : Blocks())
for (unsigned i = 1; i <= fgBBNumMax; ++i)
{
BlockSetOps::Assign(this, newReach, block->bbReach);
BasicBlock* const block = fgBBReversePostorder[i];

bool predGcSafe = (block->bbPreds != nullptr); // Do all of our predecessor blocks have a GC safe bit?

for (BasicBlock* const predBlock : block->PredBlocks())
{
/* Union the predecessor's reachability set into newReach */
BlockSetOps::UnionD(this, newReach, predBlock->bbReach);
change |= BlockSetOps::UnionDChanged(this, block->bbReach, predBlock->bbReach);

if (!(predBlock->bbFlags & BBF_GC_SAFE_POINT))
{
Expand All @@ -276,15 +277,15 @@ void Compiler::fgComputeReachabilitySets()
{
block->bbFlags |= BBF_GC_SAFE_POINT;
}

if (!BlockSetOps::Equal(this, newReach, block->bbReach))
{
BlockSetOps::Assign(this, block->bbReach, newReach);
change = true;
}
}

++changedIterCount;
} while (change);

#if COUNT_BASIC_BLOCKS
computeReachabilitySetsIterationTable.record(changedIterCount);
#endif // COUNT_BASIC_BLOCKS

#ifdef DEBUG
if (verbose)
{
Expand Down Expand Up @@ -598,15 +599,11 @@ PhaseStatus Compiler::fgComputeReachability()
madeChanges |= fgRenumberBlocks();

//
// Compute fgEnterBlks
// Compute fgEnterBlks, reverse post-order, and bbReach.
//

fgComputeEnterBlocksSet();

//
// Compute bbReach
//

fgDfsReversePostorder();
fgComputeReachabilitySets();

//
Expand All @@ -618,6 +615,10 @@ PhaseStatus Compiler::fgComputeReachability()

} while (changed);

#if COUNT_BASIC_BLOCKS
computeReachabilityIterationTable.record(passNum - 1);
#endif // COUNT_BASIC_BLOCKS

//
// Now, compute the dominators
//
Expand Down Expand Up @@ -752,19 +753,24 @@ bool Compiler::fgRemoveDeadBlocks()
// preorder and reverse postorder numbers, plus a reverse postorder for blocks.
//
// Notes:
// Assumes caller has computed the fgEnterBlkSet.
// Assumes caller has computed the fgEnterBlks set.
//
// Each block's `bbPreorderNum` and `bbPostorderNum` is set.
//
// Assumes caller has allocated the fgBBReversePostorder array.
// It will be filled in with blocks in reverse post order.
// The `fgBBReversePostorder` array is filled in with the `BasicBlock*` in reverse post-order.
//
// This algorithm only pays attention to the actual blocks. It ignores any imaginary entry block.
//
void Compiler::fgDfsReversePostorder()
{
assert(fgBBcount == fgBBNumMax);
assert(BasicBlockBitSetTraits::GetSize(this) == fgBBNumMax + 1);

// Make sure fgEnterBlks are still there in startNodes, even if they participate in a loop (i.e., there is
// an incoming edge into the block).
assert(fgEnterBlksSetValid);
assert(fgBBReversePostorder != nullptr);

fgBBReversePostorder = new (this, CMK_DominatorMemory) BasicBlock*[fgBBNumMax + 1]{};

// visited : Once we run the DFS post order sort recursive algorithm, we mark the nodes we visited to avoid
// backtracking.
Expand Down Expand Up @@ -989,13 +995,6 @@ void Compiler::fgComputeDoms()
assert(BasicBlockBitSetTraits::GetSize(this) == fgBBNumMax + 1);
#endif // DEBUG

BlockSet processedBlks(BlockSetOps::MakeEmpty(this));

fgBBReversePostorder = new (this, CMK_DominatorMemory) BasicBlock*[fgBBNumMax + 1]{};

fgDfsReversePostorder();
noway_assert(fgBBReversePostorder[0] == nullptr);

// flRoot and bbRoot represent an imaginary unique entry point in the flow graph.
// All the orphaned EH blocks and fgFirstBB will temporarily have its predecessors list
// (with bbRoot as the only basic block in it) set as flRoot.
Expand All @@ -1012,9 +1011,11 @@ void Compiler::fgComputeDoms()

FlowEdge flRoot(&bbRoot, nullptr);

noway_assert(fgBBReversePostorder[0] == nullptr);
fgBBReversePostorder[0] = &bbRoot;

// Mark both bbRoot and fgFirstBB processed
BlockSet processedBlks(BlockSetOps::MakeEmpty(this));
BlockSetOps::AddElemD(this, processedBlks, 0); // bbRoot == block #0
BlockSetOps::AddElemD(this, processedBlks, 1); // fgFirstBB == block #1
assert(fgFirstBB->bbNum == 1);
Expand Down Expand Up @@ -1057,7 +1058,8 @@ void Compiler::fgComputeDoms()
}

// Now proceed to compute the immediate dominators for each basic block.
bool changed = true;
bool changed = true;
unsigned changedIterCount = 1;
while (changed)
{
changed = false;
Expand Down Expand Up @@ -1116,8 +1118,14 @@ void Compiler::fgComputeDoms()
}
BlockSetOps::AddElemD(this, processedBlks, block->bbNum);
}

++changedIterCount;
}

#if COUNT_BASIC_BLOCKS
domsChangedIterationTable.record(changedIterCount);
#endif // COUNT_BASIC_BLOCKS

// As stated before, once we have computed immediate dominance we need to clear
// all the basic blocks whose predecessor list was set to flRoot. This
// reverts that and leaves the blocks the same as before.
Expand Down Expand Up @@ -1311,7 +1319,7 @@ void Compiler::fgNumberDomTree(DomTreeNode* domTree)
// fgIntersectDom: Intersect two immediate dominator sets.
//
// Find the lowest common ancestor in the dominator tree between two basic blocks. The LCA in the dominance tree
// represents the closest dominator between the two basic blocks. Used to adjust the IDom value in fgComputDoms.
// represents the closest dominator between the two basic blocks. Used to adjust the IDom value in fgComputeDoms.
//
// Arguments:
// a, b - two blocks to intersect
Expand Down
1 change: 0 additions & 1 deletion src/coreclr/jit/fgprofilesynthesis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,6 @@ void ProfileSynthesis::RandomizeLikelihoods()
void ProfileSynthesis::BuildReversePostorder()
{
m_comp->EnsureBasicBlockEpoch();
m_comp->fgBBReversePostorder = new (m_comp, CMK_Pgo) BasicBlock*[m_comp->fgBBNumMax + 1]{};
m_comp->fgComputeEnterBlocksSet();
m_comp->fgDfsReversePostorder();

Expand Down

0 comments on commit e65ebc9

Please sign in to comment.