electron/patches/v8/turboshaft_avoid_introducing_too_many_variables.patch


From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Darius Mercadier <dmercadier@chromium.org>
Date: Wed, 5 Nov 2025 14:06:54 +0100
Subject: [turboshaft] Avoid introducing too many Variables
.... if we have very large merges.
Cf https://crbug.com/418027512#comment5 for explanations of why this
is necessary (and the following comment for why I don't see a good
alternative to this CL).
I've locally confirmed that this fixes the OOM from
https://crbug.com/457625181, and it reduces memory consumption on
binaries/crbug-40219016-zelda/zelda.wasm (from
https://crbug.com/418027512) by 20+%.
Bug: 418027512, 457625181
Change-Id: If55af659667723ce85ff71bcac66a43aff863e05
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/7119378
Commit-Queue: Darius Mercadier <dmercadier@chromium.org>
Auto-Submit: Darius Mercadier <dmercadier@chromium.org>
Reviewed-by: Matthias Liedtke <mliedtke@chromium.org>
Cr-Commit-Position: refs/heads/main@{#103534}
diff --git a/src/compiler/turboshaft/branch-elimination-reducer.h b/src/compiler/turboshaft/branch-elimination-reducer.h
index f115c86894f0cf739d6381f7844e5589831cc209..d917d27bd3964ba07b41efa49b86435ae7720064 100644
--- a/src/compiler/turboshaft/branch-elimination-reducer.h
+++ b/src/compiler/turboshaft/branch-elimination-reducer.h
@@ -323,6 +323,10 @@ class BranchEliminationReducer : public Next {
goto no_change;
}
+ if (!__ CanCreateNVariables(destination_origin->OpCountUpperBound())) {
+ goto no_change;
+ }
+
if (const BranchOp* branch = last_op.template TryCast<BranchOp>()) {
V<Word32> condition =
__ template MapToNewGraph<true>(branch->condition());
diff --git a/src/compiler/turboshaft/copying-phase.h b/src/compiler/turboshaft/copying-phase.h
index 875861d005435b1c2a1591886c053ca360c3e2f2..b43958499d5b6d6e72b81d965d0729bb213c7ae6 100644
--- a/src/compiler/turboshaft/copying-phase.h
+++ b/src/compiler/turboshaft/copying-phase.h
@@ -714,9 +714,23 @@ class GraphVisitor : public OutputGraphAssembler<GraphVisitor<AfterNext>,
if (Asm().CanAutoInlineBlocksWithSinglePredecessor() &&
terminator.Is<GotoOp>()) {
Block* destination = terminator.Cast<GotoOp>().destination;
- if (destination->PredecessorCount() == 1) {
- block_to_inline_now_ = destination;
- return;
+ // Inlining the destination will require setting it in needs_variables_
+ // mode; we thus check that we can actually create enough variables to do
+ // this.
+ // TODO(dmercadier): in practice, the only reason we need variables for
+ // the destination is because we could be currently in a phase that cloned
+ // the current block, which could lead to {destination} being cloned as
+ // well. Not all phases can do this, so we could check that we're not in
+ // such a phase, and if so, not use variables for the destination. One way
+ // to do this would be to have a DisallowCloningReducer which would
+ // static_assert that LoopUnrolling/LoopPeeling/BranchElimination aren't
+ // on the stack and would also prevent using CloneSubGraph,
+ // CloneAndInlineBlock and CloneBlockAndGoto.
+ if (Asm().CanCreateNVariables(destination->OpCountUpperBound())) {
+ if (destination->PredecessorCount() == 1) {
+ block_to_inline_now_ = destination;
+ return;
+ }
}
}
// Just going through the regular VisitOp function.
diff --git a/src/compiler/turboshaft/graph.h b/src/compiler/turboshaft/graph.h
index 936c8b0269a9b87a4ffa20c40bbd908fb8c69010..a3c1c40e4e7097f518e107d85786c7cc5466e595 100644
--- a/src/compiler/turboshaft/graph.h
+++ b/src/compiler/turboshaft/graph.h
@@ -608,6 +608,7 @@ class Graph {
operation_origins_.Reset();
operation_types_.Reset();
dominator_tree_depth_ = 0;
+ max_merge_pred_count_ = 0;
#ifdef DEBUG
block_type_refinement_.Reset();
// Do not reset of graph_created_from_turbofan_ as it is propagated along
@@ -791,6 +792,8 @@ class Graph {
bound_blocks_.push_back(block);
uint32_t depth = block->ComputeDominator();
dominator_tree_depth_ = std::max<uint32_t>(dominator_tree_depth_, depth);
+ max_merge_pred_count_ =
+ std::max<uint32_t>(max_merge_pred_count_, block->PredecessorCount());
#ifdef DEBUG
if (v8_flags.turboshaft_trace_emitted) {
@@ -1016,6 +1019,8 @@ class Graph {
uint32_t DominatorTreeDepth() const { return dominator_tree_depth_; }
+ uint32_t max_merge_pred_count() const { return max_merge_pred_count_; }
+
const GrowingOpIndexSidetable<Type>& operation_types() const {
return operation_types_;
}
@@ -1068,6 +1073,7 @@ class Graph {
std::swap(next_block_, companion.next_block_);
std::swap(block_permutation_, companion.block_permutation_);
std::swap(graph_zone_, companion.graph_zone_);
+ std::swap(max_merge_pred_count_, companion.max_merge_pred_count_);
op_to_block_.SwapData(companion.op_to_block_);
source_positions_.SwapData(companion.source_positions_);
operation_origins_.SwapData(companion.operation_origins_);
@@ -1206,6 +1212,9 @@ class Graph {
GrowingOpIndexSidetable<SourcePosition> source_positions_;
GrowingOpIndexSidetable<OpIndex> operation_origins_;
uint32_t dominator_tree_depth_ = 0;
+ // {max_merge_pred_count_} stores the maximum number of predecessors that any
+ // Merge in the graph has.
+ uint32_t max_merge_pred_count_ = 0;
GrowingOpIndexSidetable<Type> operation_types_;
#ifdef DEBUG
GrowingBlockSidetable<TypeRefinements> block_type_refinement_;
diff --git a/src/compiler/turboshaft/loop-peeling-reducer.h b/src/compiler/turboshaft/loop-peeling-reducer.h
index a9b5eaaf4c88354164b3a5833d4bd6b2760b12a0..b7df7acb61d048669a2cacfbc4e2156df69788dc 100644
--- a/src/compiler/turboshaft/loop-peeling-reducer.h
+++ b/src/compiler/turboshaft/loop-peeling-reducer.h
@@ -57,8 +57,7 @@ class LoopPeelingReducer : public Next {
const Block* dst = gto.destination;
if (dst->IsLoop() && !gto.is_backedge && CanPeelLoop(dst)) {
if (ShouldSkipOptimizationStep()) goto no_change;
- PeelFirstIteration(dst);
- return {};
+ if (PeelFirstIteration(dst)) return {};
} else if (IsEmittingPeeledIteration() && dst == current_loop_header_) {
// We skip the backedge of the loop: PeelFirstIeration will instead emit a
// forward edge to the non-peeled header.
@@ -111,13 +110,21 @@ class LoopPeelingReducer : public Next {
kEmittingUnpeeledBody
};
- void PeelFirstIteration(const Block* header) {
+ bool PeelFirstIteration(const Block* header) {
TRACE("LoopPeeling: peeling loop at " << header->index());
DCHECK_EQ(peeling_, PeelingStatus::kNotPeeling);
ScopedModification<PeelingStatus> scope(&peeling_,
PeelingStatus::kEmittingPeeledLoop);
current_loop_header_ = header;
+ constexpr int kNumberOfLoopCopies = 2; // peeled + unpeeled
+ size_t op_count_upper_bound =
+ loop_finder_.GetLoopInfo(header).op_count * kNumberOfLoopCopies;
+ if (!__ CanCreateNVariables(op_count_upper_bound)) {
+ TRACE("> Too many variables, skipping peeling");
+ return false;
+ }
+
// Emitting the peeled iteration.
auto loop_body = loop_finder_.GetLoopBody(header);
// Note that this call to CloneSubGraph will not emit the backedge because
@@ -133,7 +140,7 @@ class LoopPeelingReducer : public Next {
// While peeling, we realized that the 2nd iteration of the loop is not
// reachable.
TRACE("> Second iteration is not reachable, stopping now");
- return;
+ return true;
}
// We now emit the regular unpeeled loop.
@@ -141,6 +148,7 @@ class LoopPeelingReducer : public Next {
TRACE("> Emitting unpeeled loop body");
__ CloneSubGraph(loop_body, /* keep_loop_kinds */ true,
/* is_loop_after_peeling */ true);
+ return true;
}
bool CanPeelLoop(const Block* header) {
diff --git a/src/compiler/turboshaft/loop-unrolling-reducer.h b/src/compiler/turboshaft/loop-unrolling-reducer.h
index 181d298bfa27d21f013016b34a586078d12f8a58..92d6f7b36d4c5c0a64723f7d18427a62347bad9f 100644
--- a/src/compiler/turboshaft/loop-unrolling-reducer.h
+++ b/src/compiler/turboshaft/loop-unrolling-reducer.h
@@ -211,6 +211,11 @@ class V8_EXPORT_PRIVATE LoopUnrollingAnalyzer {
info.op_count < kMaxLoopSizeForPartialUnrolling;
}
+ size_t GetLoopOpCount(const Block* loop_header) {
+ DCHECK(loop_header->IsLoop());
+ return loop_finder_.GetLoopInfo(loop_header).op_count;
+ }
+
// The returned unroll count is the total number of copies of the loop body
// in the resulting graph, i.e., an unroll count of N means N-1 copies of the
// body which were partially unrolled, and 1 for the original/remaining body.
@@ -383,14 +388,12 @@ class LoopUnrollingReducer : public Next {
// header (note that loop headers only have 2 predecessor, including the
// backedge), and that isn't the backedge.
if (ShouldSkipOptimizationStep()) goto no_change;
- if (analyzer_.ShouldRemoveLoop(dst)) {
- RemoveLoop(dst);
+ if (analyzer_.ShouldRemoveLoop(dst) && RemoveLoop(dst)) {
return {};
- } else if (analyzer_.ShouldFullyUnrollLoop(dst)) {
- FullyUnrollLoop(dst);
+ } else if (analyzer_.ShouldFullyUnrollLoop(dst) && FullyUnrollLoop(dst)) {
return {};
- } else if (analyzer_.ShouldPartiallyUnrollLoop(dst)) {
- PartiallyUnrollLoop(dst);
+ } else if (analyzer_.ShouldPartiallyUnrollLoop(dst) &&
+ PartiallyUnrollLoop(dst)) {
return {};
}
} else if ((unrolling_ == UnrollingStatus::kUnrolling) &&
@@ -467,9 +470,9 @@ class LoopUnrollingReducer : public Next {
// and would like to not emit the loop body that follows.
kRemoveLoop,
};
- void RemoveLoop(const Block* header);
- void FullyUnrollLoop(const Block* header);
- void PartiallyUnrollLoop(const Block* header);
+ bool RemoveLoop(const Block* header);
+ bool FullyUnrollLoop(const Block* header);
+ bool PartiallyUnrollLoop(const Block* header);
void FixLoopPhis(const Block* input_graph_loop, Block* output_graph_loop,
const Block* backedge_block);
bool IsRunningBuiltinPipeline() {
@@ -508,10 +511,16 @@ class LoopUnrollingReducer : public Next {
};
template <class Next>
-void LoopUnrollingReducer<Next>::PartiallyUnrollLoop(const Block* header) {
+bool LoopUnrollingReducer<Next>::PartiallyUnrollLoop(const Block* header) {
TRACE("LoopUnrolling: partially unrolling loop at " << header->index().id());
DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling);
DCHECK(!skip_next_stack_check_);
+
+ if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) {
+ TRACE("> Too many variables, skipping unrolling");
+ return false;
+ }
+
unrolling_ = UnrollingStatus::kUnrolling;
auto loop_body = analyzer_.GetLoopBody(header);
@@ -533,7 +542,7 @@ void LoopUnrollingReducer<Next>::PartiallyUnrollLoop(const Block* header) {
__ CloneSubGraph(loop_body, /* keep_loop_kinds */ true);
if (StopUnrollingIfUnreachable(output_graph_header)) {
TRACE("> Next iteration is unreachable, stopping unrolling");
- return;
+ return true;
}
// Emitting the subsequent folded iterations. We set `unrolling_` to
@@ -549,7 +558,7 @@ void LoopUnrollingReducer<Next>::PartiallyUnrollLoop(const Block* header) {
__ CloneSubGraph(loop_body, /* keep_loop_kinds */ false);
if (StopUnrollingIfUnreachable(output_graph_header)) {
TRACE("> Next iteration is unreachable, stopping unrolling");
- return;
+ return true;
}
}
@@ -567,6 +576,7 @@ void LoopUnrollingReducer<Next>::PartiallyUnrollLoop(const Block* header) {
unrolling_ = UnrollingStatus::kNotUnrolling;
TRACE("> Finished partially unrolling loop " << header->index().id());
+ return true;
}
template <class Next>
@@ -622,10 +632,20 @@ void LoopUnrollingReducer<Next>::FixLoopPhis(const Block* input_graph_loop,
}
template <class Next>
-void LoopUnrollingReducer<Next>::RemoveLoop(const Block* header) {
+bool LoopUnrollingReducer<Next>::RemoveLoop(const Block* header) {
TRACE("LoopUnrolling: removing loop at " << header->index().id());
DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling);
DCHECK(!skip_next_stack_check_);
+
+ if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) {
+ TRACE("> Too many variables, skipping removal");
+ // TODO(dmercadier): in theory, RemoveLoop shouldn't need Variables, since
+ // it cannot be called while unrolling an outer loop, since we only unroll
+ // innermost loops. We should teach CloneAndInlineBlock that it doesn't
+ // always need to introduce Variables, and then remove this bailout.
+ return false;
+ }
+
// When removing a loop, we still need to emit the header (since it has to
// always be executed before the 1st iteration anyways), but by setting
// {unrolling_} to `kRemoveLoop`, the final Branch of the loop will become a
@@ -633,15 +653,21 @@ void LoopUnrollingReducer<Next>::RemoveLoop(const Block* header) {
unrolling_ = UnrollingStatus::kRemoveLoop;
__ CloneAndInlineBlock(header);
unrolling_ = UnrollingStatus::kNotUnrolling;
+ return true;
}
template <class Next>
-void LoopUnrollingReducer<Next>::FullyUnrollLoop(const Block* header) {
+bool LoopUnrollingReducer<Next>::FullyUnrollLoop(const Block* header) {
TRACE("LoopUnrolling: fully unrolling loop at " << header->index().id());
DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling);
DCHECK(!skip_next_stack_check_);
ScopedModification<bool> skip_stack_checks(&skip_next_stack_check_, true);
+ if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) {
+ TRACE("> Too many variables, skipping unrolling");
+ return false;
+ }
+
size_t iter_count = analyzer_.GetIterationCount(header).exact_count();
TRACE("> iter_count: " << iter_count);
@@ -654,7 +680,7 @@ void LoopUnrollingReducer<Next>::FullyUnrollLoop(const Block* header) {
__ CloneSubGraph(loop_body, /* keep_loop_kinds */ false);
if (StopUnrollingIfUnreachable()) {
TRACE("> Next iteration is unreachable, stopping unrolling");
- return;
+ return true;
}
}
@@ -667,6 +693,7 @@ void LoopUnrollingReducer<Next>::FullyUnrollLoop(const Block* header) {
unrolling_ = UnrollingStatus::kNotUnrolling;
TRACE("> Finished fully unrolling loop " << header->index().id());
+ return true;
}
#undef TRACE
diff --git a/src/compiler/turboshaft/turbolev-graph-builder.cc b/src/compiler/turboshaft/turbolev-graph-builder.cc
index 3db187b8c48cc0c7168be039e7d90078c4df7bda..d80362036da4c80e192ed489e3c66e8bfed271ba 100644
--- a/src/compiler/turboshaft/turbolev-graph-builder.cc
+++ b/src/compiler/turboshaft/turbolev-graph-builder.cc
@@ -118,12 +118,7 @@ class BlockOriginTrackingReducer : public Next {
}
void Bind(Block* block) {
Next::Bind(block);
- // The 1st block we bind doesn't exist in Maglev and is meant to hold
- // Constants (which in Maglev are not in any block), and thus
- // {maglev_input_block_} should still be nullptr. In all other cases,
- // {maglev_input_block_} should not be nullptr.
- DCHECK_EQ(maglev_input_block_ == nullptr,
- block == &__ output_graph().StartBlock());
+ DCHECK_NOT_NULL(maglev_input_block_);
turboshaft_block_origins_[block->index()] = maglev_input_block_;
}
@@ -519,9 +514,11 @@ class GraphBuildingNodeProcessor {
block_mapping_[block] =
block->is_loop() ? __ NewLoopHeader() : __ NewBlock();
}
- // Constants are not in a block in Maglev but are in Turboshaft. We bind a
- // block now, so that Constants can then be emitted.
- __ Bind(__ NewBlock());
+ // Constants are not in a block in Maglev but are in Turboshaft. We bind the
+ // 1st block now, so that Constants can then be emitted.
+ const maglev::BasicBlock* first_maglev_block = graph->blocks().front();
+ __ SetMaglevInputBlock(first_maglev_block);
+ __ Bind(block_mapping_[first_maglev_block]);
// Initializing undefined constant so that we don't need to recreate it too
// often.
@@ -607,9 +604,20 @@ class GraphBuildingNodeProcessor {
Block* turboshaft_block = Map(maglev_block);
if (__ current_block() != nullptr) {
- // The first block for Constants doesn't end with a Jump, so we add one
- // now.
- __ Goto(turboshaft_block);
+ // We must be in the first block of the graph, inserted by Turboshaft in
+ // PreProcessGraph so that constants can be bound in a block. No need to
+ // do anything else: we don't emit a Goto so that the actual 1st block of
+ // the Maglev graph gets inlined into this first block of the Turboshaft
+ // graph, which, in addition to saving a Goto, saves the need to clone the
+ // destination into the current block later, and also ensures that
+ // Parameters are always in the 1st block.
+ DCHECK_EQ(__ output_graph().block_count(), 1);
+ DCHECK_EQ(maglev_block->id(), 0);
+ DCHECK_EQ(__ current_block(), block_mapping_[maglev_block]);
+ // maglev_input_block should have been set by calling SetMaglevInputBlock
+ // in PreProcessGraph.
+ DCHECK_EQ(__ maglev_input_block(), maglev_block);
+ return maglev::BlockProcessResult::kContinue;
}
#ifdef DEBUG
diff --git a/src/compiler/turboshaft/variable-reducer.h b/src/compiler/turboshaft/variable-reducer.h
index b11338bdf6e928cd09a0bdbad42fd835c8210c36..03cc2fa77f0d4a194893a8be5747d6de887e5ee9 100644
--- a/src/compiler/turboshaft/variable-reducer.h
+++ b/src/compiler/turboshaft/variable-reducer.h
@@ -9,6 +9,7 @@
#include <optional>
#include "src/base/logging.h"
+#include "src/base/macros.h"
#include "src/codegen/machine-type.h"
#include "src/compiler/turboshaft/assembler.h"
#include "src/compiler/turboshaft/graph.h"
@@ -91,6 +92,15 @@ class VariableReducer : public RequiredOptimizationReducer<AfterNext> {
public:
TURBOSHAFT_REDUCER_BOILERPLATE(VariableReducer)
+ ~VariableReducer() {
+ if (too_many_variables_bailouts_count_ != 0 &&
+ V8_UNLIKELY(v8_flags.trace_turbo_bailouts)) {
+ std::cout << "Bailing out from block cloning "
+ << too_many_variables_bailouts_count_ << " time"
+ << (too_many_variables_bailouts_count_ > 1 ? "s" : "") << "\n";
+ }
+ }
+
void Bind(Block* new_block) {
Next::Bind(new_block);
@@ -190,6 +200,26 @@ class VariableReducer : public RequiredOptimizationReducer<AfterNext> {
return table_.GetPredecessorValue(var, predecessor_index);
}
+ bool CanCreateNVariables(size_t n) {
+ // Merges with many predecessors combined with many variables can quickly
+ // blow up memory since the SnapshotTable needs to create a state whose
+ // size can be up to number_of_predecessor*variable_count (note: in
+ // practice, it's often not quite variable_count but less since only
+ // variables that are live in at least one predecessor are counted). To
+ // avoid OOM or otherwise huge memory consumption, we thus stop creating
+ // variables (and bail out on optimizations that need variables) when this
+ // number becomes too large. I somewhat arbitrarily selected 100K here,
+ // which sounds high, but in terms of memory, it's just 100K*8=800KB, which
+ // is less than 1MB, which isn't going to amount for much in a function
+ // that is probably very large if it managed to reach this limit.
+ constexpr uint32_t kMaxAllowedMergeStateSize = 100'000;
+ bool can_create =
+ __ input_graph().max_merge_pred_count() * (variable_count_ + n) <
+ kMaxAllowedMergeStateSize;
+ if (!can_create) too_many_variables_bailouts_count_++;
+ return can_create;
+ }
+
void SetVariable(Variable var, OpIndex new_index) {
DCHECK(!is_temporary_);
if (V8_UNLIKELY(__ generating_unreachable_operations())) return;
@@ -206,10 +236,12 @@ class VariableReducer : public RequiredOptimizationReducer<AfterNext> {
Variable NewLoopInvariantVariable(MaybeRegisterRepresentation rep) {
DCHECK(!is_temporary_);
+ variable_count_++;
return table_.NewKey(VariableData{rep, true}, OpIndex::Invalid());
}
Variable NewVariable(MaybeRegisterRepresentation rep) {
DCHECK(!is_temporary_);
+ variable_count_++;
return table_.NewKey(VariableData{rep, false}, OpIndex::Invalid());
}
@@ -314,6 +346,10 @@ class VariableReducer : public RequiredOptimizationReducer<AfterNext> {
__ input_graph().block_count(), std::nullopt, __ phase_zone()};
bool is_temporary_ = false;
+ // Tracks the number of variables that have been created.
+ uint32_t variable_count_ = 0;
+ uint32_t too_many_variables_bailouts_count_ = 0;
+
// {predecessors_} is used during merging, but we use an instance variable for
// it, in order to save memory and not reallocate it for each merge.
ZoneVector<Snapshot> predecessors_{__ phase_zone()};
diff --git a/test/unittests/compiler/turboshaft/control-flow-unittest.cc b/test/unittests/compiler/turboshaft/control-flow-unittest.cc
index 49e1c8c2561bd010d12e5229c4d6594b9846b40b..b39b073a2ea899550fe0df6a81dcebc2d75efa49 100644
--- a/test/unittests/compiler/turboshaft/control-flow-unittest.cc
+++ b/test/unittests/compiler/turboshaft/control-flow-unittest.cc
@@ -55,7 +55,7 @@ TEST_F(ControlFlowTest, DefaultBlockInlining) {
// BranchElimination should remove such branches by cloning the block with the
// branch. In the end, the graph should contain (almost) no branches anymore.
TEST_F(ControlFlowTest, BranchElimination) {
- static constexpr int kSize = 10000;
+ static constexpr int kSize = 200;
auto test = CreateFromGraph(1, [](auto& Asm) {
V<Word32> cond =