From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Darius Mercadier Date: Wed, 5 Nov 2025 14:06:54 +0100 Subject: [turboshaft] Avoid introducing too many Variables .... if we have very large merges. Cf https://crbug.com/418027512#comment5 for explanations of why this is necessary (and the following comment for why I don't see a good alternative to this CL). I've locally confirmed that this fixes the OOM from https://crbug.com/457625181, and it reduces memory consumption on binaries/crbug-40219016-zelda/zelda.wasm (from https://crbug.com/418027512) by 20+%. Bug: 418027512, 457625181 Change-Id: If55af659667723ce85ff71bcac66a43aff863e05 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/7119378 Commit-Queue: Darius Mercadier Auto-Submit: Darius Mercadier Reviewed-by: Matthias Liedtke Cr-Commit-Position: refs/heads/main@{#103534} diff --git a/src/compiler/turboshaft/branch-elimination-reducer.h b/src/compiler/turboshaft/branch-elimination-reducer.h index f115c86894f0cf739d6381f7844e5589831cc209..d917d27bd3964ba07b41efa49b86435ae7720064 100644 --- a/src/compiler/turboshaft/branch-elimination-reducer.h +++ b/src/compiler/turboshaft/branch-elimination-reducer.h @@ -323,6 +323,10 @@ class BranchEliminationReducer : public Next { goto no_change; } + if (!__ CanCreateNVariables(destination_origin->OpCountUpperBound())) { + goto no_change; + } + if (const BranchOp* branch = last_op.template TryCast()) { V condition = __ template MapToNewGraph(branch->condition()); diff --git a/src/compiler/turboshaft/copying-phase.h b/src/compiler/turboshaft/copying-phase.h index 875861d005435b1c2a1591886c053ca360c3e2f2..b43958499d5b6d6e72b81d965d0729bb213c7ae6 100644 --- a/src/compiler/turboshaft/copying-phase.h +++ b/src/compiler/turboshaft/copying-phase.h @@ -714,9 +714,23 @@ class GraphVisitor : public OutputGraphAssembler, if (Asm().CanAutoInlineBlocksWithSinglePredecessor() && terminator.Is()) { Block* destination = terminator.Cast().destination; - 
if (destination->PredecessorCount() == 1) { - block_to_inline_now_ = destination; - return; + // Inlining the destination will require setting it in needs_variables_ + // mode; we thus check that we can actually create enough variables to do + // this. + // TODO(dmercadier): in practice, the only reason we need variables for + // the destination is because we could be currently in a phase that cloned + // the current block, which could lead to {destination} being cloned as + // well. Not all phases can do this, so we could check that we're not in + // such a phase, and if so, not use variables for the destination. One way + // to do this would be to have a DisallowCloningReducer which would + // static_assert that LoopUnrolling/LoopPeeling/BranchElimination aren't + // on the stack and would also prevent using CloneSubGraph, + // CloneAndInlineBlock and CloneBlockAndGoto. + if (Asm().CanCreateNVariables(destination->OpCountUpperBound())) { + if (destination->PredecessorCount() == 1) { + block_to_inline_now_ = destination; + return; + } } } // Just going through the regular VisitOp function. 
diff --git a/src/compiler/turboshaft/graph.h b/src/compiler/turboshaft/graph.h index 936c8b0269a9b87a4ffa20c40bbd908fb8c69010..a3c1c40e4e7097f518e107d85786c7cc5466e595 100644 --- a/src/compiler/turboshaft/graph.h +++ b/src/compiler/turboshaft/graph.h @@ -608,6 +608,7 @@ class Graph { operation_origins_.Reset(); operation_types_.Reset(); dominator_tree_depth_ = 0; + max_merge_pred_count_ = 0; #ifdef DEBUG block_type_refinement_.Reset(); // Do not reset of graph_created_from_turbofan_ as it is propagated along @@ -791,6 +792,8 @@ class Graph { bound_blocks_.push_back(block); uint32_t depth = block->ComputeDominator(); dominator_tree_depth_ = std::max(dominator_tree_depth_, depth); + max_merge_pred_count_ = + std::max(max_merge_pred_count_, block->PredecessorCount()); #ifdef DEBUG if (v8_flags.turboshaft_trace_emitted) { @@ -1016,6 +1019,8 @@ class Graph { uint32_t DominatorTreeDepth() const { return dominator_tree_depth_; } + uint32_t max_merge_pred_count() const { return max_merge_pred_count_; } + const GrowingOpIndexSidetable& operation_types() const { return operation_types_; } @@ -1068,6 +1073,7 @@ class Graph { std::swap(next_block_, companion.next_block_); std::swap(block_permutation_, companion.block_permutation_); std::swap(graph_zone_, companion.graph_zone_); + std::swap(max_merge_pred_count_, companion.max_merge_pred_count_); op_to_block_.SwapData(companion.op_to_block_); source_positions_.SwapData(companion.source_positions_); operation_origins_.SwapData(companion.operation_origins_); @@ -1206,6 +1212,9 @@ class Graph { GrowingOpIndexSidetable source_positions_; GrowingOpIndexSidetable operation_origins_; uint32_t dominator_tree_depth_ = 0; + // {max_merge_pred_count_} stores the maximum number of predecessors that any + // Merge in the graph has. 
+ uint32_t max_merge_pred_count_ = 0; GrowingOpIndexSidetable operation_types_; #ifdef DEBUG GrowingBlockSidetable block_type_refinement_; diff --git a/src/compiler/turboshaft/loop-peeling-reducer.h b/src/compiler/turboshaft/loop-peeling-reducer.h index a9b5eaaf4c88354164b3a5833d4bd6b2760b12a0..b7df7acb61d048669a2cacfbc4e2156df69788dc 100644 --- a/src/compiler/turboshaft/loop-peeling-reducer.h +++ b/src/compiler/turboshaft/loop-peeling-reducer.h @@ -57,8 +57,7 @@ class LoopPeelingReducer : public Next { const Block* dst = gto.destination; if (dst->IsLoop() && !gto.is_backedge && CanPeelLoop(dst)) { if (ShouldSkipOptimizationStep()) goto no_change; - PeelFirstIteration(dst); - return {}; + if (PeelFirstIteration(dst)) return {}; } else if (IsEmittingPeeledIteration() && dst == current_loop_header_) { // We skip the backedge of the loop: PeelFirstIeration will instead emit a // forward edge to the non-peeled header. @@ -111,13 +110,21 @@ class LoopPeelingReducer : public Next { kEmittingUnpeeledBody }; - void PeelFirstIteration(const Block* header) { + bool PeelFirstIteration(const Block* header) { TRACE("LoopPeeling: peeling loop at " << header->index()); DCHECK_EQ(peeling_, PeelingStatus::kNotPeeling); ScopedModification scope(&peeling_, PeelingStatus::kEmittingPeeledLoop); current_loop_header_ = header; + constexpr int kNumberOfLoopCopies = 2; // peeled + unpeeled + size_t op_count_upper_bound = + loop_finder_.GetLoopInfo(header).op_count * kNumberOfLoopCopies; + if (!__ CanCreateNVariables(op_count_upper_bound)) { + TRACE("> Too many variables, skipping peeling"); + return false; + } + // Emitting the peeled iteration. auto loop_body = loop_finder_.GetLoopBody(header); // Note that this call to CloneSubGraph will not emit the backedge because @@ -133,7 +140,7 @@ class LoopPeelingReducer : public Next { // While peeling, we realized that the 2nd iteration of the loop is not // reachable. 
TRACE("> Second iteration is not reachable, stopping now"); - return; + return true; } // We now emit the regular unpeeled loop. @@ -141,6 +148,7 @@ class LoopPeelingReducer : public Next { TRACE("> Emitting unpeeled loop body"); __ CloneSubGraph(loop_body, /* keep_loop_kinds */ true, /* is_loop_after_peeling */ true); + return true; } bool CanPeelLoop(const Block* header) { diff --git a/src/compiler/turboshaft/loop-unrolling-reducer.h b/src/compiler/turboshaft/loop-unrolling-reducer.h index 181d298bfa27d21f013016b34a586078d12f8a58..92d6f7b36d4c5c0a64723f7d18427a62347bad9f 100644 --- a/src/compiler/turboshaft/loop-unrolling-reducer.h +++ b/src/compiler/turboshaft/loop-unrolling-reducer.h @@ -211,6 +211,11 @@ class V8_EXPORT_PRIVATE LoopUnrollingAnalyzer { info.op_count < kMaxLoopSizeForPartialUnrolling; } + size_t GetLoopOpCount(const Block* loop_header) { + DCHECK(loop_header->IsLoop()); + return loop_finder_.GetLoopInfo(loop_header).op_count; + } + // The returned unroll count is the total number of copies of the loop body // in the resulting graph, i.e., an unroll count of N means N-1 copies of the // body which were partially unrolled, and 1 for the original/remaining body. @@ -383,14 +388,12 @@ class LoopUnrollingReducer : public Next { // header (note that loop headers only have 2 predecessor, including the // backedge), and that isn't the backedge. 
if (ShouldSkipOptimizationStep()) goto no_change; - if (analyzer_.ShouldRemoveLoop(dst)) { - RemoveLoop(dst); + if (analyzer_.ShouldRemoveLoop(dst) && RemoveLoop(dst)) { return {}; - } else if (analyzer_.ShouldFullyUnrollLoop(dst)) { - FullyUnrollLoop(dst); + } else if (analyzer_.ShouldFullyUnrollLoop(dst) && FullyUnrollLoop(dst)) { return {}; - } else if (analyzer_.ShouldPartiallyUnrollLoop(dst)) { - PartiallyUnrollLoop(dst); + } else if (analyzer_.ShouldPartiallyUnrollLoop(dst) && + PartiallyUnrollLoop(dst)) { return {}; } } else if ((unrolling_ == UnrollingStatus::kUnrolling) && @@ -467,9 +470,9 @@ class LoopUnrollingReducer : public Next { // and would like to not emit the loop body that follows. kRemoveLoop, }; - void RemoveLoop(const Block* header); - void FullyUnrollLoop(const Block* header); - void PartiallyUnrollLoop(const Block* header); + bool RemoveLoop(const Block* header); + bool FullyUnrollLoop(const Block* header); + bool PartiallyUnrollLoop(const Block* header); void FixLoopPhis(const Block* input_graph_loop, Block* output_graph_loop, const Block* backedge_block); bool IsRunningBuiltinPipeline() { @@ -508,10 +511,16 @@ class LoopUnrollingReducer : public Next { }; template -void LoopUnrollingReducer::PartiallyUnrollLoop(const Block* header) { +bool LoopUnrollingReducer::PartiallyUnrollLoop(const Block* header) { TRACE("LoopUnrolling: partially unrolling loop at " << header->index().id()); DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling); DCHECK(!skip_next_stack_check_); + + if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) { + TRACE("> Too many variables, skipping unrolling"); + return false; + } + unrolling_ = UnrollingStatus::kUnrolling; auto loop_body = analyzer_.GetLoopBody(header); @@ -533,7 +542,7 @@ void LoopUnrollingReducer::PartiallyUnrollLoop(const Block* header) { __ CloneSubGraph(loop_body, /* keep_loop_kinds */ true); if (StopUnrollingIfUnreachable(output_graph_header)) { TRACE("> Next iteration is unreachable, 
stopping unrolling"); - return; + return true; } // Emitting the subsequent folded iterations. We set `unrolling_` to @@ -549,7 +558,7 @@ void LoopUnrollingReducer::PartiallyUnrollLoop(const Block* header) { __ CloneSubGraph(loop_body, /* keep_loop_kinds */ false); if (StopUnrollingIfUnreachable(output_graph_header)) { TRACE("> Next iteration is unreachable, stopping unrolling"); - return; + return true; } } @@ -567,6 +576,7 @@ void LoopUnrollingReducer::PartiallyUnrollLoop(const Block* header) { unrolling_ = UnrollingStatus::kNotUnrolling; TRACE("> Finished partially unrolling loop " << header->index().id()); + return true; } template @@ -622,10 +632,20 @@ void LoopUnrollingReducer::FixLoopPhis(const Block* input_graph_loop, } template -void LoopUnrollingReducer::RemoveLoop(const Block* header) { +bool LoopUnrollingReducer::RemoveLoop(const Block* header) { TRACE("LoopUnrolling: removing loop at " << header->index().id()); DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling); DCHECK(!skip_next_stack_check_); + + if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) { + TRACE("> Too many variables, skipping removal"); + // TODO(dmercadier): in theory, RemoveLoop shouldn't need Variables, since + // it cannot be called while unrolling an outer loop, since we only unroll + // innermost loops. We should teach CloneAndInlineBlock that it doesn't + // always need to introduce Variables, and then remove this bailout. 
+ return false; + } + // When removing a loop, we still need to emit the header (since it has to // always be executed before the 1st iteration anyways), but by setting // {unrolling_} to `kRemoveLoop`, the final Branch of the loop will become a @@ -633,15 +653,21 @@ void LoopUnrollingReducer::RemoveLoop(const Block* header) { unrolling_ = UnrollingStatus::kRemoveLoop; __ CloneAndInlineBlock(header); unrolling_ = UnrollingStatus::kNotUnrolling; + return true; } template -void LoopUnrollingReducer::FullyUnrollLoop(const Block* header) { +bool LoopUnrollingReducer::FullyUnrollLoop(const Block* header) { TRACE("LoopUnrolling: fully unrolling loop at " << header->index().id()); DCHECK_EQ(unrolling_, UnrollingStatus::kNotUnrolling); DCHECK(!skip_next_stack_check_); ScopedModification skip_stack_checks(&skip_next_stack_check_, true); + if (!__ CanCreateNVariables(analyzer_.GetLoopOpCount(header))) { + TRACE("> Too many variables, skipping unrolling"); + return false; + } + size_t iter_count = analyzer_.GetIterationCount(header).exact_count(); TRACE("> iter_count: " << iter_count); @@ -654,7 +680,7 @@ void LoopUnrollingReducer::FullyUnrollLoop(const Block* header) { __ CloneSubGraph(loop_body, /* keep_loop_kinds */ false); if (StopUnrollingIfUnreachable()) { TRACE("> Next iteration is unreachable, stopping unrolling"); - return; + return true; } } @@ -667,6 +693,7 @@ void LoopUnrollingReducer::FullyUnrollLoop(const Block* header) { unrolling_ = UnrollingStatus::kNotUnrolling; TRACE("> Finished fully unrolling loop " << header->index().id()); + return true; } #undef TRACE diff --git a/src/compiler/turboshaft/turbolev-graph-builder.cc b/src/compiler/turboshaft/turbolev-graph-builder.cc index 3db187b8c48cc0c7168be039e7d90078c4df7bda..d80362036da4c80e192ed489e3c66e8bfed271ba 100644 --- a/src/compiler/turboshaft/turbolev-graph-builder.cc +++ b/src/compiler/turboshaft/turbolev-graph-builder.cc @@ -118,12 +118,7 @@ class BlockOriginTrackingReducer : public Next { } void 
Bind(Block* block) { Next::Bind(block); - // The 1st block we bind doesn't exist in Maglev and is meant to hold - // Constants (which in Maglev are not in any block), and thus - // {maglev_input_block_} should still be nullptr. In all other cases, - // {maglev_input_block_} should not be nullptr. - DCHECK_EQ(maglev_input_block_ == nullptr, - block == &__ output_graph().StartBlock()); + DCHECK_NOT_NULL(maglev_input_block_); turboshaft_block_origins_[block->index()] = maglev_input_block_; } @@ -519,9 +514,11 @@ class GraphBuildingNodeProcessor { block_mapping_[block] = block->is_loop() ? __ NewLoopHeader() : __ NewBlock(); } - // Constants are not in a block in Maglev but are in Turboshaft. We bind a - // block now, so that Constants can then be emitted. - __ Bind(__ NewBlock()); + // Constants are not in a block in Maglev but are in Turboshaft. We bind the + // 1st block now, so that Constants can then be emitted. + const maglev::BasicBlock* first_maglev_block = graph->blocks().front(); + __ SetMaglevInputBlock(first_maglev_block); + __ Bind(block_mapping_[first_maglev_block]); // Initializing undefined constant so that we don't need to recreate it too // often. @@ -607,9 +604,20 @@ class GraphBuildingNodeProcessor { Block* turboshaft_block = Map(maglev_block); if (__ current_block() != nullptr) { - // The first block for Constants doesn't end with a Jump, so we add one - // now. - __ Goto(turboshaft_block); + // We must be in the first block of the graph, inserted by Turboshaft in + // PreProcessGraph so that constants can be bound in a block. No need to + // do anything else: we don't emit a Goto so that the actual 1st block of + // the Maglev graph gets inlined into this first block of the Turboshaft + // graph, which, in addition to saving a Goto, saves the need to clone the + // destination into the current block later, and also ensures that + // Parameters are always in the 1st block. 
+ DCHECK_EQ(__ output_graph().block_count(), 1); + DCHECK_EQ(maglev_block->id(), 0); + DCHECK_EQ(__ current_block(), block_mapping_[maglev_block]); + // maglev_input_block should have been set by calling SetMaglevInputBlock + // in PreProcessGraph. + DCHECK_EQ(__ maglev_input_block(), maglev_block); + return maglev::BlockProcessResult::kContinue; } #ifdef DEBUG diff --git a/src/compiler/turboshaft/variable-reducer.h b/src/compiler/turboshaft/variable-reducer.h index b11338bdf6e928cd09a0bdbad42fd835c8210c36..03cc2fa77f0d4a194893a8be5747d6de887e5ee9 100644 --- a/src/compiler/turboshaft/variable-reducer.h +++ b/src/compiler/turboshaft/variable-reducer.h @@ -9,6 +9,7 @@ #include #include "src/base/logging.h" +#include "src/base/macros.h" #include "src/codegen/machine-type.h" #include "src/compiler/turboshaft/assembler.h" #include "src/compiler/turboshaft/graph.h" @@ -91,6 +92,15 @@ class VariableReducer : public RequiredOptimizationReducer { public: TURBOSHAFT_REDUCER_BOILERPLATE(VariableReducer) + ~VariableReducer() { + if (too_many_variables_bailouts_count_ != 0 && + V8_UNLIKELY(v8_flags.trace_turbo_bailouts)) { + std::cout << "Bailing out from block cloning " + << too_many_variables_bailouts_count_ << " time" + << (too_many_variables_bailouts_count_ > 1 ? "s" : "") << "\n"; + } + } + void Bind(Block* new_block) { Next::Bind(new_block); @@ -190,6 +200,26 @@ class VariableReducer : public RequiredOptimizationReducer { return table_.GetPredecessorValue(var, predecessor_index); } + bool CanCreateNVariables(size_t n) { + // Merges with many predecessors combined with many variables can quickly + // blow up memory since the SnapshotTable needs to create a state whose + // size can be up to number_of_predecessor*variable_count (note: in + // practice, it's often not quite variable_count but less since only + // variables that are live in at least one predecessor are counted). 
To + // avoid OOM or otherwise huge memory consumption, we thus stop creating + // variables (and bail out on optimizations that need variables) when this + // number becomes too large. I somewhat arbitrarily selected 100K here, + // which sounds high, but in terms of memory, it's just 100K*8=800KB, which + // is less than 1MB, which isn't going to amount for much in a function + // that is probably very large if it managed to reach this limit. + constexpr uint32_t kMaxAllowedMergeStateSize = 100'000; + bool can_create = + __ input_graph().max_merge_pred_count() * (variable_count_ + n) < + kMaxAllowedMergeStateSize; + if (!can_create) too_many_variables_bailouts_count_++; + return can_create; + } + void SetVariable(Variable var, OpIndex new_index) { DCHECK(!is_temporary_); if (V8_UNLIKELY(__ generating_unreachable_operations())) return; @@ -206,10 +236,12 @@ class VariableReducer : public RequiredOptimizationReducer { Variable NewLoopInvariantVariable(MaybeRegisterRepresentation rep) { DCHECK(!is_temporary_); + variable_count_++; return table_.NewKey(VariableData{rep, true}, OpIndex::Invalid()); } Variable NewVariable(MaybeRegisterRepresentation rep) { DCHECK(!is_temporary_); + variable_count_++; return table_.NewKey(VariableData{rep, false}, OpIndex::Invalid()); } @@ -314,6 +346,10 @@ class VariableReducer : public RequiredOptimizationReducer { __ input_graph().block_count(), std::nullopt, __ phase_zone()}; bool is_temporary_ = false; + // Tracks the number of variables that have been created. + uint32_t variable_count_ = 0; + uint32_t too_many_variables_bailouts_count_ = 0; + // {predecessors_} is used during merging, but we use an instance variable for // it, in order to save memory and not reallocate it for each merge. 
ZoneVector predecessors_{__ phase_zone()}; diff --git a/test/unittests/compiler/turboshaft/control-flow-unittest.cc b/test/unittests/compiler/turboshaft/control-flow-unittest.cc index 49e1c8c2561bd010d12e5229c4d6594b9846b40b..b39b073a2ea899550fe0df6a81dcebc2d75efa49 100644 --- a/test/unittests/compiler/turboshaft/control-flow-unittest.cc +++ b/test/unittests/compiler/turboshaft/control-flow-unittest.cc @@ -55,7 +55,7 @@ TEST_F(ControlFlowTest, DefaultBlockInlining) { // BranchElimination should remove such branches by cloning the block with the // branch. In the end, the graph should contain (almost) no branches anymore. TEST_F(ControlFlowTest, BranchElimination) { - static constexpr int kSize = 10000; + static constexpr int kSize = 200; auto test = CreateFromGraph(1, [](auto& Asm) { V cond =