From be585661a8d4d239eb9f49965d87f687a1eaacd4 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 2 Apr 2024 18:02:50 -0700 Subject: [PATCH] Implement GPU frustum culling. This commit implements opt-in GPU frustum culling, built on top of the infrastructure in #12773. To enable it on a camera, add the `GpuCulling` component to it. To additionally disable CPU frustum culling, add the `NoCpuCulling` component. Note that adding `GpuCulling` without `NoCpuCulling` *currently* does nothing useful. The reason why `GpuCulling` doesn't automatically imply `NoCpuCulling` is that I intend to follow this patch up with GPU two-phase occlusion culling, and CPU frustum culling plus GPU occlusion culling seems like a very commonly-desired mode. Adding the `GpuCulling` component to a view puts that view into *indirect mode*. This mode makes all drawcalls indirect, relying on the mesh preprocessing shader to allocate instances dynamically. In indirect mode, the `PreprocessWorkItem` `output_index` points not to a `MeshUniform` instance slot but instead to a set of `wgpu` `IndirectParameters`, from which it allocates an instance slot dynamically if frustum culling succeeds. Batch building has been updated to allocate and track indirect parameter slots, and the AABBs are now supplied to the GPU as `MeshCullingData`. A small amount of code relating to the frustum culling has been borrowed from meshlets and moved into `maths.wgsl`. Note that standard Bevy frustum culling uses AABBs, while meshlets use bounding spheres; this means that not as much code can be shared as one might think. This patch doesn't provide any way to perform GPU culling on shadow maps, to avoid making this patch bigger than it already is. That can be a followup.
--- crates/bevy_core_pipeline/src/core_2d/mod.rs | 13 +- crates/bevy_core_pipeline/src/core_3d/mod.rs | 54 +- crates/bevy_core_pipeline/src/deferred/mod.rs | 34 +- crates/bevy_core_pipeline/src/prepass/mod.rs | 34 +- crates/bevy_gizmos/src/pipeline_2d.rs | 8 +- crates/bevy_gizmos/src/pipeline_3d.rs | 8 +- crates/bevy_pbr/src/material.rs | 6 +- .../bevy_pbr/src/meshlet/cull_meshlets.wgsl | 11 +- crates/bevy_pbr/src/render/gpu_preprocess.rs | 329 +++++++++--- crates/bevy_pbr/src/render/light.rs | 15 +- crates/bevy_pbr/src/render/mesh.rs | 480 ++++++++++++++---- .../bevy_pbr/src/render/mesh_preprocess.wgsl | 144 +++++- .../src/batching/gpu_preprocessing.rs | 443 +++++++++++++--- crates/bevy_render/src/batching/mod.rs | 18 +- .../src/batching/no_gpu_preprocessing.rs | 23 +- crates/bevy_render/src/camera/camera.rs | 9 +- crates/bevy_render/src/lib.rs | 2 + crates/bevy_render/src/maths.wgsl | 18 + crates/bevy_render/src/render_phase/mod.rs | 331 +++++++++--- .../src/render_resource/buffer_vec.rs | 5 + crates/bevy_render/src/view/mod.rs | 6 + crates/bevy_render/src/view/visibility/mod.rs | 8 +- crates/bevy_sprite/src/mesh2d/material.rs | 6 +- crates/bevy_sprite/src/mesh2d/mesh.rs | 2 +- crates/bevy_sprite/src/render/mod.rs | 6 +- crates/bevy_ui/src/render/mod.rs | 10 +- crates/bevy_ui/src/render/render_pass.rs | 11 +- .../src/render/ui_material_pipeline.rs | 2 +- examples/2d/mesh2d_manual.rs | 7 +- examples/3d/deferred_rendering.rs | 10 +- examples/3d/lighting.rs | 4 +- examples/shader/shader_instancing.rs | 6 +- examples/stress_tests/many_cubes.rs | 21 +- 33 files changed, 1624 insertions(+), 460 deletions(-) diff --git a/crates/bevy_core_pipeline/src/core_2d/mod.rs b/crates/bevy_core_pipeline/src/core_2d/mod.rs index 48d2bb5a4f609f..85f986c111dd49 100644 --- a/crates/bevy_core_pipeline/src/core_2d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs @@ -38,12 +38,11 @@ use bevy_render::{ render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner}, render_phase::{ 
sort_phase_system, CachedRenderPipelinePhaseItem, DrawFunctionId, DrawFunctions, PhaseItem, - SortedPhaseItem, SortedRenderPhase, + PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, }, render_resource::CachedRenderPipelineId, Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; -use nonmax::NonMaxU32; use crate::{tonemapping::TonemappingNode, upscaling::UpscalingNode}; @@ -91,7 +90,7 @@ pub struct Transparent2d { pub pipeline: CachedRenderPipelineId, pub draw_function: DrawFunctionId, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for Transparent2d { @@ -116,13 +115,13 @@ impl PhaseItem for Transparent2d { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs index 39aee1657136f6..6b4a91f24bff0f 100644 --- a/crates/bevy_core_pipeline/src/core_3d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs @@ -57,7 +57,8 @@ use bevy_render::{ render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner}, render_phase::{ sort_phase_system, BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem, - DrawFunctionId, DrawFunctions, PhaseItem, SortedPhaseItem, SortedRenderPhase, + DrawFunctionId, DrawFunctions, PhaseItem, PhaseItemExtraIndex, SortedPhaseItem, + SortedRenderPhase, }, render_resource::{ BindGroupId, CachedRenderPipelineId, Extent3d, FilterMode, Sampler, SamplerDescriptor, @@ -69,7 +70,6 @@ use bevy_render::{ Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; use bevy_utils::{tracing::warn, HashMap}; -use nonmax::NonMaxU32; use crate::{ 
core_3d::main_transmissive_pass_3d_node::MainTransmissivePass3dNode, @@ -183,7 +183,7 @@ pub struct Opaque3d { /// The ranges of instances. pub batch_range: Range, /// The dynamic offset. - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } /// Data that must be identical in order to batch meshes together. @@ -228,14 +228,12 @@ impl PhaseItem for Opaque3d { &mut self.batch_range } - #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } - #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -247,13 +245,13 @@ impl BinnedPhaseItem for Opaque3d { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Opaque3d { key, representative_entity, batch_range, - dynamic_offset, + extra_index, } } } @@ -269,7 +267,7 @@ pub struct AlphaMask3d { pub key: OpaqueNoLightmap3dBinKey, pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for AlphaMask3d { @@ -294,13 +292,13 @@ impl PhaseItem for AlphaMask3d { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -312,13 +310,13 @@ impl BinnedPhaseItem for AlphaMask3d { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Self { key, representative_entity, 
batch_range, - dynamic_offset, + extra_index, } } } @@ -336,7 +334,7 @@ pub struct Transmissive3d { pub entity: Entity, pub draw_function: DrawFunctionId, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for Transmissive3d { @@ -372,13 +370,13 @@ impl PhaseItem for Transmissive3d { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -410,7 +408,7 @@ pub struct Transparent3d { pub entity: Entity, pub draw_function: DrawFunctionId, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for Transparent3d { @@ -435,13 +433,13 @@ impl PhaseItem for Transparent3d { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } diff --git a/crates/bevy_core_pipeline/src/deferred/mod.rs b/crates/bevy_core_pipeline/src/deferred/mod.rs index 3ccd8caad0e12b..13727318202244 100644 --- a/crates/bevy_core_pipeline/src/deferred/mod.rs +++ b/crates/bevy_core_pipeline/src/deferred/mod.rs @@ -5,10 +5,12 @@ use std::ops::Range; use bevy_ecs::prelude::*; use bevy_render::{ - render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem}, + render_phase::{ + BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem, + PhaseItemExtraIndex, + }, render_resource::{CachedRenderPipelineId, 
TextureFormat}, }; -use nonmax::NonMaxU32; use crate::prepass::OpaqueNoLightmap3dBinKey; @@ -26,7 +28,7 @@ pub struct Opaque3dDeferred { pub key: OpaqueNoLightmap3dBinKey, pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for Opaque3dDeferred { @@ -51,13 +53,13 @@ impl PhaseItem for Opaque3dDeferred { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -69,13 +71,13 @@ impl BinnedPhaseItem for Opaque3dDeferred { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Self { key, representative_entity, batch_range, - dynamic_offset, + extra_index, } } } @@ -96,7 +98,7 @@ pub struct AlphaMask3dDeferred { pub key: OpaqueNoLightmap3dBinKey, pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for AlphaMask3dDeferred { @@ -121,13 +123,13 @@ impl PhaseItem for AlphaMask3dDeferred { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -138,13 +140,13 @@ impl BinnedPhaseItem for AlphaMask3dDeferred { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Self { key, 
representative_entity, batch_range, - dynamic_offset, + extra_index, } } } diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs index 01fca93ddc2543..73908613a62f20 100644 --- a/crates/bevy_core_pipeline/src/prepass/mod.rs +++ b/crates/bevy_core_pipeline/src/prepass/mod.rs @@ -34,11 +34,13 @@ use bevy_ecs::prelude::*; use bevy_reflect::Reflect; use bevy_render::{ mesh::Mesh, - render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem}, + render_phase::{ + BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem, + PhaseItemExtraIndex, + }, render_resource::{BindGroupId, CachedRenderPipelineId, Extent3d, TextureFormat, TextureView}, texture::ColorAttachment, }; -use nonmax::NonMaxU32; pub const NORMAL_PREPASS_FORMAT: TextureFormat = TextureFormat::Rgb10a2Unorm; pub const MOTION_VECTOR_PREPASS_FORMAT: TextureFormat = TextureFormat::Rg16Float; @@ -119,7 +121,7 @@ pub struct Opaque3dPrepass { pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } // TODO: Try interning these. 
@@ -163,13 +165,13 @@ impl PhaseItem for Opaque3dPrepass { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -181,13 +183,13 @@ impl BinnedPhaseItem for Opaque3dPrepass { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Opaque3dPrepass { key, representative_entity, batch_range, - dynamic_offset, + extra_index, } } } @@ -208,7 +210,7 @@ pub struct AlphaMask3dPrepass { pub key: OpaqueNoLightmap3dBinKey, pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for AlphaMask3dPrepass { @@ -233,13 +235,13 @@ impl PhaseItem for AlphaMask3dPrepass { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -251,13 +253,13 @@ impl BinnedPhaseItem for AlphaMask3dPrepass { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Self { key, representative_entity, batch_range, - dynamic_offset, + extra_index, } } } diff --git a/crates/bevy_gizmos/src/pipeline_2d.rs b/crates/bevy_gizmos/src/pipeline_2d.rs index 4fbf9544e22b5e..660cec02c92d34 100644 --- a/crates/bevy_gizmos/src/pipeline_2d.rs +++ b/crates/bevy_gizmos/src/pipeline_2d.rs @@ -18,7 +18,9 @@ use bevy_ecs::{ use 
bevy_math::FloatOrd; use bevy_render::{ render_asset::{prepare_assets, RenderAssets}, - render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase}, + render_phase::{ + AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase, + }, render_resource::*, texture::BevyDefault, view::{ExtractedView, Msaa, RenderLayers, ViewTarget}, @@ -293,7 +295,7 @@ fn queue_line_gizmos_2d( pipeline, sort_key: FloatOrd(f32::INFINITY), batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } @@ -351,7 +353,7 @@ fn queue_line_joint_gizmos_2d( pipeline, sort_key: FloatOrd(f32::INFINITY), batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_gizmos/src/pipeline_3d.rs b/crates/bevy_gizmos/src/pipeline_3d.rs index ec9800330d1b72..e247220d541bcb 100644 --- a/crates/bevy_gizmos/src/pipeline_3d.rs +++ b/crates/bevy_gizmos/src/pipeline_3d.rs @@ -22,7 +22,9 @@ use bevy_ecs::{ use bevy_pbr::{MeshPipeline, MeshPipelineKey, SetMeshViewBindGroup}; use bevy_render::{ render_asset::{prepare_assets, RenderAssets}, - render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase}, + render_phase::{ + AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase, + }, render_resource::*, texture::BevyDefault, view::{ExtractedView, Msaa, RenderLayers, ViewTarget}, @@ -348,7 +350,7 @@ fn queue_line_gizmos_3d( pipeline, distance: 0., batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } @@ -437,7 +439,7 @@ fn queue_line_joint_gizmos_3d( pipeline, distance: 0., batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index a616db5b6973c2..5f090dcaa4c259 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -704,7 +704,7 @@ pub 
fn queue_material_meshes( pipeline: pipeline_id, distance, batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } else if material.properties.render_method == OpaqueRendererMethod::Forward { let bin_key = Opaque3dBinKey { @@ -727,7 +727,7 @@ pub fn queue_material_meshes( pipeline: pipeline_id, distance, batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } else if material.properties.render_method == OpaqueRendererMethod::Forward { let bin_key = OpaqueNoLightmap3dBinKey { @@ -755,7 +755,7 @@ pub fn queue_material_meshes( pipeline: pipeline_id, distance, batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_pbr/src/meshlet/cull_meshlets.wgsl b/crates/bevy_pbr/src/meshlet/cull_meshlets.wgsl index 015ed6ee11ff3c..e695920bd3b59b 100644 --- a/crates/bevy_pbr/src/meshlet/cull_meshlets.wgsl +++ b/crates/bevy_pbr/src/meshlet/cull_meshlets.wgsl @@ -11,7 +11,10 @@ #ifdef MESHLET_SECOND_CULLING_PASS #import bevy_pbr::meshlet_bindings::depth_pyramid #endif -#import bevy_render::maths::affine3_to_square +#import bevy_render::maths::{ + affine3_to_square, + sphere_is_inside_frustum_plane, +} /// Culls individual clusters (1 per thread) in two passes (two pass occlusion culling), and outputs a bitmask of which clusters survived. /// 1. The first pass is only frustum culling, on only the clusters that were visible last frame. 
@@ -46,7 +49,11 @@ fn cull_meshlets(@builtin(global_invocation_id) cluster_id: vec3) { // TODO: Faster method from https://vkguide.dev/docs/gpudriven/compute_culling/#frustum-culling-function for (var i = 0u; i < 6u; i++) { if !meshlet_visible { break; } - meshlet_visible &= dot(view.frustum[i], bounding_sphere_center) > -bounding_sphere_radius; + meshlet_visible &= sphere_is_inside_frustum_plane( + view.frustum[i], + bounding_sphere_center, + bounding_sphere_radius + ); } #ifdef MESHLET_SECOND_CULLING_PASS diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 21eff19668a653..b4520d1efabc1b 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -14,27 +14,34 @@ use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; use bevy_ecs::{ component::Component, entity::Entity, - query::QueryState, + query::{Has, QueryState}, schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, world::{FromWorld, World}, }; use bevy_render::{ - batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem}, + batching::gpu_preprocessing::{ + self, BatchedInstanceBuffers, IndirectParameters, IndirectParametersBuffer, + PreprocessWorkItem, + }, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ - binding_types::{storage_buffer, storage_buffer_read_only}, + binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer}, BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding, CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, SpecializedComputePipeline, SpecializedComputePipelines, }, - renderer::{RenderContext, RenderDevice}, + renderer::{RenderContext, RenderDevice, RenderQueue}, + view::{GpuCulling, 
ViewUniform, ViewUniformOffset, ViewUniforms}, Render, RenderApp, RenderSet, }; use bevy_utils::tracing::warn; +use smallvec::{smallvec, SmallVec}; -use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; +use crate::{ + graph::NodePbr, MeshCullingData, MeshCullingDataBuffer, MeshInputUniform, MeshUniform, +}; /// The handle to the `mesh_preprocess.wgsl` compute shader. pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = @@ -57,20 +64,44 @@ pub struct GpuMeshPreprocessPlugin { /// The render node for the mesh uniform building pass. pub struct GpuPreprocessNode { - view_query: QueryState<(Entity, Read)>, + view_query: QueryState<( + Entity, + Read, + Read, + Has, + )>, } -/// The compute shader pipeline for the mesh uniform building pass. +/// The compute shader pipelines for the mesh uniform building pass. #[derive(Resource)] +pub struct PreprocessPipelines { + /// The pipeline used for CPU culling. This pipeline doesn't populate + /// indirect parameters. + pub direct: PreprocessPipeline, + /// The pipeline used for GPU culling. This pipeline populates indirect + /// parameters. + pub gpu_culling: PreprocessPipeline, +} + +/// The pipeline for the GPU mesh preprocessing shader. pub struct PreprocessPipeline { - /// The single bind group layout for the compute shader. + /// The bind group layout for the compute shader. pub bind_group_layout: BindGroupLayout, /// The pipeline ID for the compute shader. /// - /// This gets filled in in `prepare_preprocess_pipeline`. + /// This gets filled in in `prepare_preprocess_pipelines`. pub pipeline_id: Option, } +/// Specifies variants of the mesh preprocessing shader. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct PreprocessPipelineKey { + /// Whether GPU culling is in use. + /// + /// This `#define`'s `GPU_CULLING` in the shader. + pub gpu_culling: bool, +} + /// The compute shader bind group for the mesh uniform building pass. /// /// This goes on the view. 
@@ -85,6 +116,23 @@ impl Plugin for GpuMeshPreprocessPlugin { "mesh_preprocess.wgsl", Shader::from_wgsl ); + + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems( + Render, + ( + prepare_preprocess_pipelines.in_set(RenderSet::Prepare), + prepare_preprocess_bind_groups + .run_if( + resource_exists::>, + ) + .in_set(RenderSet::PrepareBindGroups), + write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush), + ), + ); } fn finish(&self, app: &mut App) { @@ -106,12 +154,12 @@ impl Plugin for GpuMeshPreprocessPlugin { .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) - .init_resource::() + .init_resource::() .init_resource::>() .add_systems( Render, ( - prepare_preprocess_pipeline.in_set(RenderSet::Prepare), + prepare_preprocess_pipelines.in_set(RenderSet::Prepare), prepare_preprocess_bind_groups .run_if( resource_exists::>, @@ -148,18 +196,7 @@ impl Node for GpuPreprocessNode { } = world.resource::>(); let pipeline_cache = world.resource::(); - let preprocess_pipeline = world.resource::(); - - let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else { - warn!("The build mesh uniforms pipeline wasn't created"); - return Ok(()); - }; - - let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) - else { - // This will happen while the pipeline is being compiled and is fine. - return Ok(()); - }; + let preprocess_pipelines = world.resource::(); let mut compute_pass = render_context @@ -169,13 +206,43 @@ impl Node for GpuPreprocessNode { timestamp_writes: None, }); - compute_pass.set_pipeline(preprocess_pipeline); - // Run the compute passes. 
- for (view, bind_group) in self.view_query.iter_manual(world) { - let index_buffer = &index_buffers[&view]; - compute_pass.set_bind_group(0, &bind_group.0, &[]); - let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE); + for (view, bind_group, view_uniform_offset, gpu_culling) in + self.view_query.iter_manual(world) + { + // Grab the index buffer for this view. + let Some(index_buffer) = index_buffers.get(&view) else { + warn!("The preprocessing index buffer wasn't present"); + return Ok(()); + }; + + let maybe_pipeline_id = if gpu_culling { + preprocess_pipelines.gpu_culling.pipeline_id + } else { + preprocess_pipelines.direct.pipeline_id + }; + + let Some(preprocess_pipeline_id) = maybe_pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't uploaded"); + return Ok(()); + }; + + let Some(preprocess_pipeline) = + pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; + + compute_pass.set_pipeline(preprocess_pipeline); + + let mut dynamic_offsets: SmallVec<[u32; 1]> = smallvec![]; + if gpu_culling { + dynamic_offsets.push(view_uniform_offset.offset); + } + compute_pass.set_bind_group(0, &bind_group.0, &dynamic_offsets); + + let workgroup_count = index_buffer.buffer.len().div_ceil(WORKGROUP_SIZE); compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); } @@ -183,72 +250,147 @@ impl Node for GpuPreprocessNode { } } +impl PreprocessPipelines { + pub(crate) fn pipelines_are_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.direct.is_loaded(pipeline_cache) && self.gpu_culling.is_loaded(pipeline_cache) + } +} + +impl PreprocessPipeline { + fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.pipeline_id + .is_some_and(|pipeline_id| pipeline_cache.get_compute_pipeline(pipeline_id).is_some()) + } +} + impl SpecializedComputePipeline for PreprocessPipeline { - type Key = (); + type Key = 
PreprocessPipelineKey; + + fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor { + let mut shader_defs = vec![]; + if key.gpu_culling { + shader_defs.push("INDIRECT".into()); + shader_defs.push("FRUSTUM_CULLING".into()); + } - fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { ComputePipelineDescriptor { - label: Some("mesh preprocessing".into()), + label: Some( + format!( + "mesh preprocessing ({})", + if key.gpu_culling { + "GPU culling" + } else { + "direct" + } + ) + .into(), + ), layout: vec![self.bind_group_layout.clone()], push_constant_ranges: vec![], shader: MESH_PREPROCESS_SHADER_HANDLE, - shader_defs: vec![], + shader_defs, entry_point: "main".into(), } } } -impl FromWorld for PreprocessPipeline { +impl FromWorld for PreprocessPipelines { fn from_world(world: &mut World) -> Self { let render_device = world.resource::(); - let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential( - ShaderStages::COMPUTE, - ( - // `current_input` - storage_buffer_read_only::(false), - // `previous_input` - storage_buffer_read_only::(false), - // `indices` - storage_buffer_read_only::(false), - // `output` - storage_buffer::(false), - ), + let direct_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries(); + let gpu_culling_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries() + .extend_sequential(( + // `indirect_parameters` + storage_buffer::(/*has_dynamic_offset=*/ false), + // `mesh_culling_data` + storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + // `view` + uniform_buffer::(/*has_dynamic_offset=*/ true), + )); + + let direct_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms direct bind group layout", + &direct_bind_group_layout_entries, ); - - let bind_group_layout = render_device.create_bind_group_layout( - "build mesh uniforms bind group layout", - &bind_group_layout_entries, + let gpu_culling_bind_group_layout = 
render_device.create_bind_group_layout( + "build mesh uniforms GPU culling bind group layout", + &gpu_culling_bind_group_layout_entries, ); - PreprocessPipeline { - bind_group_layout, - pipeline_id: None, + PreprocessPipelines { + direct: PreprocessPipeline { + bind_group_layout: direct_bind_group_layout, + pipeline_id: None, + }, + gpu_culling: PreprocessPipeline { + bind_group_layout: gpu_culling_bind_group_layout, + pipeline_id: None, + }, } } } -/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary. -pub fn prepare_preprocess_pipeline( +fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { + DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // `current_input` + storage_buffer_read_only::(false), + // `previous_input` + storage_buffer_read_only::(false), + // `indices` + storage_buffer_read_only::(false), + // `output` + storage_buffer::(false), + ), + ) +} + +/// A system that specializes the `mesh_preprocess.wgsl` pipelines if necessary. 
+pub fn prepare_preprocess_pipelines( pipeline_cache: Res, mut pipelines: ResMut>, - mut preprocess_pipeline: ResMut, + mut preprocess_pipelines: ResMut, ) { - if preprocess_pipeline.pipeline_id.is_some() { - return; - } + preprocess_pipelines.direct.prepare( + &pipeline_cache, + &mut pipelines, + PreprocessPipelineKey { gpu_culling: false }, + ); + preprocess_pipelines.gpu_culling.prepare( + &pipeline_cache, + &mut pipelines, + PreprocessPipelineKey { gpu_culling: true }, + ); +} + +impl PreprocessPipeline { + fn prepare( + &mut self, + pipeline_cache: &PipelineCache, + pipelines: &mut SpecializedComputePipelines, + key: PreprocessPipelineKey, + ) { + if self.pipeline_id.is_some() { + return; + } - let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ()); - preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id); + let preprocess_pipeline_id = pipelines.specialize(pipeline_cache, self, key); + self.pipeline_id = Some(preprocess_pipeline_id); + } } -/// A system that attaches the mesh uniform buffers to the bind group for the -/// compute shader. +/// A system that attaches the mesh uniform buffers to the bind groups for the +/// variants of the mesh preprocessing compute shader. pub fn prepare_preprocess_bind_groups( mut commands: Commands, render_device: Res, batched_instance_buffers: Res>, - pipeline: Res, + indirect_parameters_buffer: Res, + mesh_culling_data_buffer: Res, + view_uniforms: Res, + pipelines: Res, ) { // Grab the `BatchedInstanceBuffers`. let BatchedInstanceBuffers { @@ -267,7 +409,7 @@ pub fn prepare_preprocess_bind_groups( }; for (view, index_buffer_vec) in index_buffers { - let Some(index_buffer) = index_buffer_vec.buffer() else { + let Some(index_buffer) = index_buffer_vec.buffer.buffer() else { continue; }; @@ -275,15 +417,27 @@ pub fn prepare_preprocess_bind_groups( // length and the underlying buffer may be longer than the actual size // of the vector. 
let index_buffer_size = NonZeroU64::try_from( - index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + index_buffer_vec.buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), ) .ok(); - commands - .entity(*view) - .insert(PreprocessBindGroup(render_device.create_bind_group( - "preprocess_bind_group", - &pipeline.bind_group_layout, + let bind_group = if index_buffer_vec.gpu_culling { + let ( + Some(indirect_parameters_buffer), + Some(mesh_culling_data_buffer), + Some(view_uniforms_binding), + ) = ( + indirect_parameters_buffer.buffer(), + mesh_culling_data_buffer.buffer(), + view_uniforms.uniforms.binding(), + ) + else { + continue; + }; + + PreprocessBindGroup(render_device.create_bind_group( + "preprocess_gpu_culling_bind_group", + &pipelines.gpu_culling.bind_group_layout, &BindGroupEntries::sequential(( current_input_buffer.as_entire_binding(), previous_input_buffer.as_entire_binding(), @@ -293,7 +447,38 @@ pub fn prepare_preprocess_bind_groups( size: index_buffer_size, }), data_buffer.as_entire_binding(), + indirect_parameters_buffer.as_entire_binding(), + mesh_culling_data_buffer.as_entire_binding(), + view_uniforms_binding, )), - ))); + )) + } else { + PreprocessBindGroup(render_device.create_bind_group( + "preprocess_indirect_bind_group", + &pipelines.direct.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + BindingResource::Buffer(BufferBinding { + buffer: index_buffer, + offset: 0, + size: index_buffer_size, + }), + data_buffer.as_entire_binding(), + )), + )) + }; + + commands.entity(*view).insert(bind_group); } } + +/// Writes the information needed to do GPU mesh culling to the GPU. 
+pub fn write_mesh_culling_data_buffer( + render_device: Res, + render_queue: Res, + mut mesh_culling_data_buffer: ResMut, +) { + mesh_culling_data_buffer.write_buffer(&render_device, &render_queue); + mesh_culling_data_buffer.clear(); +} diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 070e1a9d7b99da..c2e42710f941d4 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -22,7 +22,6 @@ use bevy_transform::{components::GlobalTransform, prelude::Transform}; #[cfg(feature = "trace")] use bevy_utils::tracing::info_span; use bevy_utils::tracing::{error, warn}; -use nonmax::NonMaxU32; use std::{hash::Hash, num::NonZeroU64, ops::Range}; use crate::*; @@ -1733,7 +1732,7 @@ pub struct Shadow { pub key: ShadowBinKey, pub representative_entity: Entity, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -1770,13 +1769,13 @@ impl PhaseItem for Shadow { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } @@ -1788,13 +1787,13 @@ impl BinnedPhaseItem for Shadow { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self { Shadow { key, representative_entity, batch_range, - dynamic_offset, + extra_index, } } } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index ebe7274f7b8dba..d837baab938b5d 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -15,10 +15,13 @@ use bevy_ecs::{ use bevy_math::{Affine3, Rect, UVec2, Vec3, 
Vec4}; use bevy_render::{ batching::{ - clear_batched_instance_buffers, gpu_preprocessing, no_gpu_preprocessing, GetBatchData, - GetFullBatchData, NoAutomaticBatching, + clear_batched_instance_buffers, + gpu_preprocessing::{self, IndirectParameters, IndirectParametersBuffer}, + no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching, }, + camera::Camera, mesh::*, + primitives::Aabb, render_asset::RenderAssets, render_phase::{ BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult, @@ -27,11 +30,11 @@ use bevy_render::{ render_resource::*, renderer::{RenderDevice, RenderQueue}, texture::{BevyDefault, DefaultImageSampler, ImageSampler, TextureFormatPixelInfo}, - view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility}, + view::{prepare_view_targets, GpuCulling, ViewTarget, ViewUniformOffset, ViewVisibility}, Extract, }; use bevy_transform::components::GlobalTransform; -use bevy_utils::{tracing::error, Entry, HashMap, Parallel}; +use bevy_utils::{tracing::error, tracing::warn, Entry, HashMap, Parallel}; #[cfg(debug_assertions)] use bevy_utils::warn_once; @@ -139,6 +142,7 @@ impl Plugin for MeshRenderPlugin { .init_resource::() .init_resource::() .init_resource::() + .init_resource::() .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) .add_systems( ExtractSchedule, @@ -202,6 +206,8 @@ impl Plugin for MeshRenderPlugin { ); }; + let indirect_parameters_buffer = IndirectParametersBuffer::new(); + let render_device = render_app.world().resource::(); if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size(render_device) @@ -212,7 +218,9 @@ impl Plugin for MeshRenderPlugin { )); } - render_app.init_resource::(); + render_app + .insert_resource(indirect_parameters_buffer) + .init_resource::(); } // Load the mesh_bindings shader module here as it depends on runtime information about @@ -288,6 +296,29 @@ pub struct MeshInputUniform { pub previous_input_index: u32, } +/// Information about each 
mesh instance needed to cull it on GPU. +/// +/// At the moment, this just consists of its axis-aligned bounding box (AABB). +#[derive(ShaderType, Pod, Zeroable, Clone, Copy)] +#[repr(C)] +pub struct MeshCullingData { + /// The 3D center of the AABB in model space, padded with an extra unused + /// float value. + pub aabb_center: Vec4, + /// The 3D extents of the AABB in model space, divided by two, padded with + /// an extra unused float value. + pub aabb_half_extents: Vec4, +} + +/// A GPU buffer that holds the information needed to cull meshes on GPU. +/// +/// At the moment, this simply holds each mesh's AABB. +/// +/// To avoid wasting CPU time in the CPU culling case, this buffer will be empty +/// if GPU culling isn't in use. +#[derive(Resource, Deref, DerefMut)] +pub struct MeshCullingDataBuffer(BufferVec); + impl MeshUniform { pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option) -> Self { let (inverse_transpose_model_a, inverse_transpose_model_b) = @@ -422,6 +453,46 @@ pub struct RenderMeshInstanceGpuBuilder { pub mesh_flags: MeshFlags, } +/// Holds information that will be needed to construct the [`MeshCullingData`]. +/// +/// This is the same as [`MeshCullingData`], but avoids the extra padding to +/// save a bit of CPU memory bandwidth. +pub struct MeshCullingDataGpuBuilder { + /// The 3D center of the AABB in model space. + pub aabb_center: Vec3, + /// The 3D extents of the AABB in model space, divided by two. + pub aabb_half_extents: Vec3, +} + +/// The per-thread queues used during [`extract_meshes_for_gpu_building`]. +/// +/// There are two varieties of these: one for when culling happens on CPU and +/// one for when culling happens on GPU. Having the two varieties avoids wasting +/// space if GPU culling is disabled. +#[derive(Default)] +pub enum RenderMeshInstanceGpuQueue { + /// The default value. 
+ /// + /// This becomes [`RenderMeshInstanceGpuQueue::CpuCulling`] or + /// [`RenderMeshInstanceGpuQueue::GpuCulling`] once extraction starts. + #[default] + None, + /// The version of [`RenderMeshInstanceGpuQueue`] that omits the + /// [`MeshCullingDataGpuBuilder`], so that we don't waste space when GPU + /// culling is disabled. + CpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder)>), + /// The version of [`RenderMeshInstanceGpuQueue`] that contains the + /// [`MeshCullingDataGpuBuilder`], used when any view has GPU culling + /// enabled. + GpuCulling( + Vec<( + Entity, + RenderMeshInstanceGpuBuilder, + MeshCullingDataGpuBuilder, + )>, + ), +} + impl RenderMeshInstanceShared { fn from_components( previous_transform: Option<&PreviousGlobalTransform>, @@ -512,44 +583,167 @@ impl RenderMeshInstances { } } -pub(crate) trait RenderMeshInstancesTable { - /// Returns the ID of the mesh asset attached to the given entity, if any. - fn mesh_asset_id(&self, entity: Entity) -> Option>; +impl RenderMeshInstancesCpu { + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity) + .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id) + } - /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a - /// mesh attached. 
- fn render_mesh_queue_data(&self, entity: Entity) -> Option; + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity) + .map(|render_mesh_instance| RenderMeshQueueData { + shared: &render_mesh_instance.shared, + translation: render_mesh_instance.transforms.transform.translation, + }) + } } -impl RenderMeshInstancesTable for RenderMeshInstancesCpu { +impl RenderMeshInstancesGpu { fn mesh_asset_id(&self, entity: Entity) -> Option> { - self.get(&entity).map(|instance| instance.mesh_asset_id) + self.get(&entity) + .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id) } fn render_mesh_queue_data(&self, entity: Entity) -> Option { - self.get(&entity).map(|instance| RenderMeshQueueData { - shared: &instance.shared, - translation: instance.transforms.transform.translation, - }) + self.get(&entity) + .map(|render_mesh_instance| RenderMeshQueueData { + shared: &render_mesh_instance.shared, + translation: render_mesh_instance.translation, + }) } } -impl RenderMeshInstancesTable for RenderMeshInstancesGpu { - /// Returns the ID of the mesh asset attached to the given entity, if any. - fn mesh_asset_id(&self, entity: Entity) -> Option> { - self.get(&entity).map(|instance| instance.mesh_asset_id) +impl RenderMeshInstanceGpuQueue { + /// Clears out a [`RenderMeshInstanceGpuQueue`], creating or recreating it + /// as necessary. + /// + /// `any_gpu_culling` should be set to true if any view has GPU culling + /// enabled. + fn init(&mut self, any_gpu_culling: bool) { + match (any_gpu_culling, &mut *self) { + (true, RenderMeshInstanceGpuQueue::GpuCulling(queue)) => queue.clear(), + (true, _) => *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![]), + (false, RenderMeshInstanceGpuQueue::CpuCulling(queue)) => queue.clear(), + (false, _) => *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![]), + } } - /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a - /// mesh attached. 
- fn render_mesh_queue_data(&self, entity: Entity) -> Option { - self.get(&entity).map(|instance| RenderMeshQueueData { - shared: &instance.shared, - translation: instance.translation, + /// Adds a new mesh to this queue. + fn push( + &mut self, + entity: Entity, + instance_builder: RenderMeshInstanceGpuBuilder, + culling_data_builder: Option, + ) { + match (&mut *self, culling_data_builder) { + (&mut RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue), None) => { + queue.push((entity, instance_builder)); + } + ( + &mut RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue), + Some(culling_data_builder), + ) => { + queue.push((entity, instance_builder, culling_data_builder)); + } + (_, None) => { + *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![(entity, instance_builder)]); + } + (_, Some(culling_data_builder)) => { + *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![( + entity, + instance_builder, + culling_data_builder, + )]); + } + } + } +} + +impl RenderMeshInstanceGpuBuilder { + /// Flushes this mesh instance to the [`RenderMeshInstanceGpu`] and + /// [`MeshInputUniform`] tables. + fn add_to( + self, + entity: Entity, + render_mesh_instances: &mut EntityHashMap, + prev_render_mesh_instances: &mut EntityHashMap, + current_input_buffer: &mut BufferVec, + ) -> usize { + let previous_input_index = if self + .shared + .flags + .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM) + { + prev_render_mesh_instances + .get(&entity) + .map(|render_mesh_instance| render_mesh_instance.current_uniform_index) + } else { + None + }; + + // Push the mesh input uniform. + let current_uniform_index = current_input_buffer.push(MeshInputUniform { + transform: self.transform.to_transpose(), + lightmap_uv_rect: self.lightmap_uv_rect, + flags: self.mesh_flags.bits(), + previous_input_index: match previous_input_index { + Some(previous_input_index) => previous_input_index.into(), + None => u32::MAX, + }, + }); + + // Record the [`RenderMeshInstance`]. 
+ render_mesh_instances.insert( + entity, + RenderMeshInstanceGpu { + translation: self.transform.translation, + shared: self.shared, + current_uniform_index: (current_uniform_index as u32) + .try_into() + .unwrap_or_default(), + }, + ); + + current_uniform_index + } +} + +impl MeshCullingDataGpuBuilder { + /// Returns a new [`MeshCullingDataGpuBuilder`] initialized with the given + /// AABB. + /// + /// If no AABB is provided, an infinitely-large one is conservatively + /// chosen. + fn new(aabb: Option<&Aabb>) -> Self { + match aabb { + Some(aabb) => MeshCullingDataGpuBuilder { + aabb_center: aabb.center.into(), + aabb_half_extents: aabb.half_extents.into(), + }, + None => MeshCullingDataGpuBuilder { + aabb_center: Vec3::ZERO, + aabb_half_extents: Vec3::INFINITY, + }, + } + } + + /// Flushes this mesh instance culling data to the + /// [`MeshCullingDataBuffer`]. + fn add_to(&self, mesh_culling_data_buffer: &mut MeshCullingDataBuffer) -> usize { + mesh_culling_data_buffer.push(MeshCullingData { + aabb_center: self.aabb_center.extend(0.0), + aabb_half_extents: self.aabb_half_extents.extend(0.0), }) } } +impl Default for MeshCullingDataBuffer { + #[inline] + fn default() -> Self { + Self(BufferVec::new(BufferUsages::STORAGE)) + } +} + /// Data that [`crate::material::queue_material_meshes`] and similar systems /// need in order to place entities that contain meshes in the right batch. 
#[derive(Deref)] @@ -659,7 +853,8 @@ pub fn extract_meshes_for_gpu_building( mut batched_instance_buffers: ResMut< gpu_preprocessing::BatchedInstanceBuffers, >, - mut render_mesh_instance_queues: Local>>, + mut mesh_culling_data_buffer: ResMut, + mut render_mesh_instance_queues: Local>, mut prev_render_mesh_instances: Local, meshes_query: Extract< Query<( @@ -668,6 +863,7 @@ pub fn extract_meshes_for_gpu_building( &GlobalTransform, Option<&PreviousGlobalTransform>, Option<&Lightmap>, + Option<&Aabb>, &Handle, Has, Has, @@ -675,7 +871,13 @@ pub fn extract_meshes_for_gpu_building( Has, )>, >, + cameras_query: Extract, With)>>, ) { + let any_gpu_culling = !cameras_query.is_empty(); + for render_mesh_instance_queue in render_mesh_instance_queues.iter_mut() { + render_mesh_instance_queue.init(any_gpu_culling); + } + meshes_query.par_iter().for_each( |( entity, @@ -683,6 +885,7 @@ pub fn extract_meshes_for_gpu_building( transform, previous_transform, lightmap, + aabb, handle, not_shadow_receiver, transmitted_receiver, @@ -706,16 +909,22 @@ pub fn extract_meshes_for_gpu_building( let lightmap_uv_rect = lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect)); + let gpu_mesh_culling_data_builder = + any_gpu_culling.then(|| MeshCullingDataGpuBuilder::new(aabb)); + + let gpu_mesh_instance_builder = RenderMeshInstanceGpuBuilder { + shared, + transform: (&transform.affine()).into(), + lightmap_uv_rect, + mesh_flags, + }; + render_mesh_instance_queues.scope(|queue| { - queue.push(( + queue.push( entity, - RenderMeshInstanceGpuBuilder { - shared, - transform: (&transform.affine()).into(), - lightmap_uv_rect, - mesh_flags, - }, - )); + gpu_mesh_instance_builder, + gpu_mesh_culling_data_builder, + ); }); }, ); @@ -723,6 +932,7 @@ pub fn extract_meshes_for_gpu_building( collect_meshes_for_gpu_building( &mut render_mesh_instances, &mut batched_instance_buffers, + &mut mesh_culling_data_buffer, &mut render_mesh_instance_queues, &mut prev_render_mesh_instances, ); @@ 
-736,7 +946,8 @@ fn collect_meshes_for_gpu_building( MeshUniform, MeshInputUniform, >, - render_mesh_instance_queues: &mut Parallel>, + mesh_culling_data_buffer: &mut MeshCullingDataBuffer, + render_mesh_instance_queues: &mut Parallel, prev_render_mesh_instances: &mut RenderMeshInstancesGpu, ) { // Collect render mesh instances. Build up the uniform buffer. @@ -760,41 +971,32 @@ fn collect_meshes_for_gpu_building( // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s. render_mesh_instances.clear(); - for queue in render_mesh_instance_queues.iter_mut() { - for (entity, builder) in queue.drain(..) { - let previous_input_index = if builder - .shared - .flags - .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM) - { - prev_render_mesh_instances - .get(&entity) - .map(|render_mesh_instance| render_mesh_instance.current_uniform_index) - } else { - None - }; - // Push the mesh input uniform. - let current_uniform_index = current_input_buffer.push(MeshInputUniform { - transform: builder.transform.to_transpose(), - lightmap_uv_rect: builder.lightmap_uv_rect, - flags: builder.mesh_flags.bits(), - previous_input_index: match previous_input_index { - Some(previous_input_index) => previous_input_index.into(), - None => u32::MAX, - }, - }) as u32; - - // Record the [`RenderMeshInstance`]. - render_mesh_instances.insert( - entity, - RenderMeshInstanceGpu { - translation: builder.transform.translation, - shared: builder.shared, - current_uniform_index: NonMaxU32::try_from(current_uniform_index) - .unwrap_or_default(), - }, - ); + for queue in render_mesh_instance_queues.iter_mut() { + match *queue { + RenderMeshInstanceGpuQueue::None => todo!(), + RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue) => { + for (entity, mesh_instance_builder) in queue.drain(..) 
{ + mesh_instance_builder.add_to( + entity, + render_mesh_instances, + prev_render_mesh_instances, + current_input_buffer, + ); + } + } + RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue) => { + for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) { + let instance_data_index = mesh_instance_builder.add_to( + entity, + render_mesh_instances, + prev_render_mesh_instances, + current_input_buffer, + ); + let culling_data_index = mesh_culling_builder.add_to(mesh_culling_data_buffer); + debug_assert_eq!(instance_data_index, culling_data_index); + } + } } } } @@ -917,7 +1119,11 @@ impl MeshPipeline { } impl GetBatchData for MeshPipeline { - type Param = (SRes, SRes); + type Param = ( + SRes, + SRes, + SRes>, + ); // The material bind group ID, the mesh ID, and the lightmap ID, // respectively. type CompareData = (MaterialBindGroupId, AssetId, Option>); @@ -925,7 +1131,7 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; fn get_batch_data( - (mesh_instances, lightmaps): &SystemParamItem, + (mesh_instances, lightmaps, _): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -956,7 +1162,7 @@ impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; fn get_index_and_compare_data( - (mesh_instances, lightmaps): &SystemParamItem, + (mesh_instances, lightmaps, _): &SystemParamItem, entity: Entity, ) -> Option<(NonMaxU32, Option)> { // This should only be called during GPU building. 
@@ -982,7 +1188,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_batch_data( - (mesh_instances, lightmaps): &SystemParamItem, + (mesh_instances, lightmaps, _): &SystemParamItem, entity: Entity, ) -> Option { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1001,7 +1207,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_index( - (mesh_instances, _): &SystemParamItem, + (mesh_instances, _, _): &SystemParamItem, entity: Entity, ) -> Option { // This should only be called during GPU building. @@ -1017,6 +1223,67 @@ impl GetFullBatchData for MeshPipeline { .get(&entity) .map(|entity| entity.current_uniform_index) } + + fn get_batch_indirect_parameters_index( + (mesh_instances, _, meshes): &SystemParamItem, + indirect_parameters_buffer: &mut IndirectParametersBuffer, + entity: Entity, + instance_index: u32, + ) -> Option { + get_batch_indirect_parameters_index( + mesh_instances, + meshes, + indirect_parameters_buffer, + entity, + instance_index, + ) + } +} + +/// Pushes a set of [`IndirectParameters`] onto the [`IndirectParametersBuffer`] +/// for the given mesh instance, and returns the index of those indirect +/// parameters. +fn get_batch_indirect_parameters_index( + mesh_instances: &RenderMeshInstances, + meshes: &RenderAssets, + indirect_parameters_buffer: &mut IndirectParametersBuffer, + entity: Entity, + instance_index: u32, +) -> Option { + // This should only be called during GPU building. + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = *mesh_instances else { + error!( + "`get_batch_indirect_parameters_index` should never be called in CPU mesh uniform \ + building mode" + ); + return None; + }; + + let mesh_instance = mesh_instances.get(&entity)?; + let mesh = meshes.get(mesh_instance.mesh_asset_id)?; + + let indirect_parameters = match mesh.buffer_info { + GpuBufferInfo::Indexed { + count: index_count, .. 
+ } => IndirectParameters { + data0: index_count, + instance_count: 0, + first_vertex: 0, + data1: 0, + first_instance: instance_index, + }, + GpuBufferInfo::NonIndexed => IndirectParameters { + data0: mesh.vertex_count, + instance_count: 0, + first_vertex: 0, + data1: instance_index, + first_instance: instance_index, + }, + }; + + (indirect_parameters_buffer.push(indirect_parameters) as u32) + .try_into() + .ok() } bitflags::bitflags! { @@ -1698,7 +1965,7 @@ impl RenderCommand

for SetMeshBindGroup { let mut dynamic_offsets: [u32; 3] = Default::default(); let mut offset_count = 0; - if let Some(dynamic_offset) = item.dynamic_offset() { + if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() { dynamic_offsets[offset_count] = dynamic_offset.get(); offset_count += 1; } @@ -1721,8 +1988,9 @@ impl RenderCommand

for DrawMesh { type Param = ( SRes>, SRes, + SRes, SRes, - Option>, + Option>, ); type ViewQuery = Has; type ItemQuery = (); @@ -1731,25 +1999,15 @@ impl RenderCommand

for DrawMesh { item: &P, has_preprocess_bind_group: ROQueryItem, _item_query: Option<()>, - (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem< - 'w, - '_, - Self::Param, - >, + (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { // If we're using GPU preprocessing, then we're dependent on that // compute shader having been run, which of course can only happen if // it's compiled. Otherwise, our mesh instance data won't be present. - if let Some(preprocess_pipeline) = preprocess_pipeline { + if let Some(preprocess_pipelines) = preprocess_pipelines { if !has_preprocess_bind_group - || !preprocess_pipeline - .pipeline_id - .is_some_and(|preprocess_pipeline_id| { - pipeline_cache - .get_compute_pipeline(preprocess_pipeline_id) - .is_some() - }) + || !preprocess_pipelines.pipelines_are_loaded(&pipeline_cache) { return RenderCommandResult::Failure; } @@ -1757,6 +2015,7 @@ impl RenderCommand

for DrawMesh { let meshes = meshes.into_inner(); let mesh_instances = mesh_instances.into_inner(); + let indirect_parameters_buffer = indirect_parameters_buffer.into_inner(); let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else { return RenderCommandResult::Failure; @@ -1765,6 +2024,21 @@ impl RenderCommand

for DrawMesh { return RenderCommandResult::Failure; }; + // Calculate the indirect offset, and look up the buffer. + let indirect_parameters = match item.extra_index().as_indirect_parameters_index() { + None => None, + Some(index) => match indirect_parameters_buffer.buffer() { + None => { + warn!("Not rendering mesh because indirect parameters buffer wasn't present"); + return RenderCommandResult::Failure; + } + Some(buffer) => Some(( + index as u64 * mem::size_of::() as u64, + buffer, + )), + }, + }; + pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); let batch_range = item.batch_range(); @@ -1774,6 +2048,8 @@ impl RenderCommand

for DrawMesh { 0, &(batch_range.start as i32).to_le_bytes(), ); + + // Draw either directly or indirectly, as appropriate. match &gpu_mesh.buffer_info { GpuBufferInfo::Indexed { buffer, @@ -1781,11 +2057,25 @@ impl RenderCommand

for DrawMesh { count, } => { pass.set_index_buffer(buffer.slice(..), 0, *index_format); - pass.draw_indexed(0..*count, 0, batch_range.clone()); - } - GpuBufferInfo::NonIndexed => { - pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); + match indirect_parameters { + None => { + pass.draw_indexed(0..*count, 0, batch_range.clone()); + } + Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass + .draw_indexed_indirect( + indirect_parameters_buffer, + indirect_parameters_offset, + ), + } } + GpuBufferInfo::NonIndexed => match indirect_parameters { + None => { + pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); + } + Some((indirect_parameters_offset, indirect_parameters_buffer)) => { + pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset); + } + }, } RenderCommandResult::Success } diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index c4adaa51056232..a6643701bc727f 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -9,6 +9,7 @@ #import bevy_pbr::mesh_types::Mesh #import bevy_render::maths +#import bevy_render::view::View // Per-frame data that the CPU supplies to the GPU. struct MeshInput { @@ -23,15 +24,45 @@ struct MeshInput { previous_input_index: u32, } +// Information about each mesh instance needed to cull it on GPU. +// +// At the moment, this just consists of its axis-aligned bounding box (AABB). +struct MeshCullingData { + // The 3D center of the AABB in model space, padded with an extra unused + // float value. + aabb_center: vec4, + // The 3D extents of the AABB in model space, divided by two, padded with + // an extra unused float value. + aabb_half_extents: vec4, +} + // One invocation of this compute shader: i.e. one mesh instance in a view. struct PreprocessWorkItem { // The index of the `MeshInput` in the `current_input` buffer that we read // from. 
input_index: u32, - // The index of the `Mesh` in `output` that we write to. + // In direct mode, the index of the `Mesh` in `output` that we write to. In + // indirect mode, the index of the `IndirectParameters` in + // `indirect_parameters` that we write to. output_index: u32, } +// The `wgpu` indirect parameters structure. This is a union of two structures. +// For more information, see the corresponding comment in +// `gpu_preprocessing.rs`. +struct IndirectParameters { + // `vertex_count` or `index_count`. + data0: u32, + // `instance_count` in both structures. + instance_count: atomic, + // `first_vertex` in both structures. + first_vertex: u32, + // `first_instance` or `base_vertex`. + data1: u32, + // A read-only copy of `instance_index`. + instance_index: u32, +} + // The current frame's `MeshInput`. @group(0) @binding(0) var current_input: array; // The `MeshInput` values from the previous frame. @@ -43,20 +74,103 @@ struct PreprocessWorkItem { // The output array of `Mesh`es. @group(0) @binding(3) var output: array; +#ifdef INDIRECT +// The array of indirect parameters for drawcalls. +@group(0) @binding(4) var indirect_parameters: array; +#endif + +#ifdef FRUSTUM_CULLING +// Data needed to cull the meshes. +// +// At the moment, this consists only of AABBs. +@group(0) @binding(5) var mesh_culling_data: array; + +// The view data, including the view matrix. +@group(0) @binding(6) var view: View; + +// Returns true if the view frustum intersects a sphere, specified in world +// space. +fn view_frustum_intersects_sphere(sphere_center_3: vec3, sphere_radius: f32) -> bool { + let sphere_center_4 = vec4(sphere_center_3, 1.0); + for (var i = 0; i < 5; i += 1) { + if (!maths::sphere_is_inside_frustum_plane( + view.frustum[i], + sphere_center_4, + sphere_radius + )) { + return false; + } + } + return true; +} + +// Returns true if the view frustum intersects an oriented bounding box (OBB). +// +// `aabb_center.w` should be 1.0. 
+fn view_frustum_intersects_obb( + model: mat4x4, + aabb_center: vec4, + aabb_half_extents: vec3, +) -> bool { + + for (var i = 0; i < 5; i += 1) { + // Calculate relative radius of the sphere associated with this plane. + let plane_normal = view.frustum[i]; + let relative_radius = dot( + abs( + vec3( + dot(plane_normal, model[0]), + dot(plane_normal, model[1]), + dot(plane_normal, model[2]), + ) + ), + aabb_half_extents + ); + + // Check the frustum plane. + if (!maths::sphere_is_inside_frustum_plane(plane_normal, aabb_center, relative_radius)) { + return false; + } + } + + return true; +} +#endif + @compute @workgroup_size(64) fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { + // Figure out our instance index. If this thread doesn't correspond to any + // index, bail. let instance_index = global_invocation_id.x; if (instance_index >= arrayLength(&work_items)) { return; } // Unpack. - let mesh_index = work_items[instance_index].input_index; + let input_index = work_items[instance_index].input_index; let output_index = work_items[instance_index].output_index; - let model_affine_transpose = current_input[mesh_index].model; + let model_affine_transpose = current_input[input_index].model; let model = maths::affine3_to_square(model_affine_transpose); + // Cull if necessary. +#ifdef FRUSTUM_CULLING + let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; + let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz; + + // Do a quick sphere-based frustum cull. + let sphere_center = model * vec4(aabb_center, 1.0); + let sphere_radius = length(maths::mat4x4_to_mat3x3(model) * aabb_half_extents); + if (!view_frustum_intersects_sphere(sphere_center.xyz, sphere_radius)) { + return; + } + + // Do the more expensive OBB-based frustum cull. + if (!view_frustum_intersects_obb(model, sphere_center, aabb_half_extents)) { + return; + } +#endif + // Calculate inverse transpose. 
let inverse_transpose_model = transpose(maths::inverse_affine3(transpose( model_affine_transpose))); @@ -68,7 +182,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { let inverse_transpose_model_b = inverse_transpose_model[2].z; // Look up the previous model matrix. - let previous_input_index = current_input[mesh_index].previous_input_index; + let previous_input_index = current_input[input_index].previous_input_index; var previous_model: mat3x4; if (previous_input_index == 0xffffffff) { previous_model = model_affine_transpose; @@ -76,11 +190,21 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { previous_model = previous_input[previous_input_index].model; } + // Figure out the output index. In indirect mode, this involves bumping the + // instance index in the indirect parameters structure. Otherwise, this + // index was directly supplied to us. +#ifdef INDIRECT + let mesh_output_index = indirect_parameters[output_index].instance_index + + atomicAdd(&indirect_parameters[output_index].instance_count, 1u); +#else + let mesh_output_index = output_index; +#endif + // Write the output. 
- output[output_index].model = model_affine_transpose; - output[output_index].previous_model = previous_model; - output[output_index].inverse_transpose_model_a = inverse_transpose_model_a; - output[output_index].inverse_transpose_model_b = inverse_transpose_model_b; - output[output_index].flags = current_input[mesh_index].flags; - output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect; + output[mesh_output_index].model = model_affine_transpose; + output[mesh_output_index].previous_model = previous_model; + output[mesh_output_index].inverse_transpose_model_a = inverse_transpose_model_a; + output[mesh_output_index].inverse_transpose_model_b = inverse_transpose_model_b; + output[mesh_output_index].flags = current_input[input_index].flags; + output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect; } diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 8757943419eb6f..c22a154cbeb1dd 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -1,30 +1,47 @@ //! Batching functionality when GPU preprocessing is in use. 
-use std::marker::PhantomData; - +use bevy_app::{App, Plugin}; +use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ entity::Entity, - query::With, + query::{Has, With}, + schedule::IntoSystemConfigs as _, system::{Query, Res, ResMut, Resource, StaticSystemParam}, }; use bevy_encase_derive::ShaderType; use bevy_utils::EntityHashMap; use bytemuck::{Pod, Zeroable}; +use nonmax::NonMaxU32; use smallvec::smallvec; use wgpu::{BindingResource, BufferUsages}; use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, - SortedPhaseItem, SortedRenderPhase, + PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices, }, - render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec}, + render_resource::{BufferVec, GpuArrayBufferable, UninitBufferVec}, renderer::{RenderDevice, RenderQueue}, - view::ViewTarget, + view::{GpuCulling, ViewTarget}, + Render, RenderApp, RenderSet, }; -use super::GetFullBatchData; +use super::{BatchMeta, GetBatchData, GetFullBatchData}; + +pub struct BatchingPlugin; + +impl Plugin for BatchingPlugin { + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + render_app.add_systems( + Render, + write_indirect_parameters_buffer.in_set(RenderSet::PrepareResourcesFlush), + ); + } +} /// The GPU buffers holding the data needed to render batches. /// /// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the @@ -51,7 +68,7 @@ where /// corresponds to each instance. /// /// This is keyed off each view. Each view has a separate buffer. - pub work_item_buffers: EntityHashMap>, + pub work_item_buffers: EntityHashMap, /// The uniform data inputs for the current frame. /// @@ -68,6 +85,14 @@ where pub previous_input_buffer: BufferVec, } +/// The buffer of GPU preprocessing work items for a single view. 
+pub struct PreprocessWorkItemBuffer { + /// The buffer of work items. + pub buffer: BufferVec, + /// True if we're using GPU culling. + pub gpu_culling: bool, +} + /// One invocation of the preprocessing shader: i.e. one mesh instance in a /// view. #[derive(Clone, Copy, Pod, Zeroable, ShaderType)] @@ -76,10 +101,94 @@ pub struct PreprocessWorkItem { /// The index of the batch input data in the input buffer that the shader /// reads from. pub input_index: u32, - /// The index of the `MeshUniform` in the output buffer that we write to. + /// In direct mode, this is the index of the `MeshUniform` in the output + /// buffer that we write to. In indirect mode, this is the index of the + /// [`IndirectParameters`]. pub output_index: u32, } +/// The `wgpu` indirect parameters structure. +/// +/// This is actually a union of the two following structures: +/// +/// ``` +/// #[repr(C)] +/// struct ArrayIndirectParameters { +/// vertex_count: u32, +/// instance_count: u32, +/// first_vertex: u32, +/// first_instance: u32, +/// } +/// +/// #[repr(C)] +/// struct ElementIndirectParameters { +/// index_count: u32, +/// instance_count: u32, +/// first_vertex: u32, +/// base_vertex: u32, +/// first_instance: u32, +/// } +/// ``` +/// +/// We actually generally treat these two variants identically in code. To do +/// that, we make the following two observations: +/// +/// 1. `instance_count` is in the same place in both structures. So we can +/// access it regardless of the structure we're looking at. +/// +/// 2. The second structure is one word larger than the first. Thus we need to +/// pad out the first structure by one word in order to place both structures in +/// an array. If we pad out `ArrayIndirectParameters` by copying the +/// `first_instance` field into the padding, then the resulting union structure +/// will always have a read-only copy of `first_instance` in the final word. We +/// take advantage of this in the shader to eliminate branching. 
+#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
+#[repr(C)]
+pub struct IndirectParameters {
+    /// For `ArrayIndirectParameters`, `vertex_count`; for
+    /// `ElementIndirectParameters`, `index_count`.
+    pub data0: u32,
+
+    /// The number of instances we're going to draw.
+    ///
+    /// This field is in the same place in both structures.
+    pub instance_count: u32,
+
+    /// The index of the first vertex we're to draw.
+    pub first_vertex: u32,
+
+    /// For `ArrayIndirectParameters`, `first_instance`; for
+    /// `ElementIndirectParameters`, `base_vertex`.
+    pub data1: u32,
+
+    /// For `ArrayIndirectParameters`, this is padding; for
+    /// `ElementIndirectParameters`, this is `first_instance`.
+    ///
+    /// Conventionally, we copy `first_instance` into this field when padding
+    /// out `ArrayIndirectParameters`. That way, shader code can read this value
+    /// at the same place, regardless of the specific structure this represents.
+    pub first_instance: u32,
+}
+
+/// The buffer containing the list of [`IndirectParameters`], for draw commands.
+#[derive(Resource, Deref, DerefMut)]
+pub struct IndirectParametersBuffer(pub BufferVec);
+
+impl IndirectParametersBuffer {
+    /// Creates the indirect parameters buffer.
+    pub fn new() -> IndirectParametersBuffer {
+        IndirectParametersBuffer(BufferVec::new(
+            BufferUsages::STORAGE | BufferUsages::INDIRECT,
+        ))
+    }
+}
+
+impl Default for IndirectParametersBuffer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl BatchedInstanceBuffers
 where
     BD: GpuArrayBufferable + Sync + Send + 'static,
@@ -110,7 +219,7 @@ where
         self.current_input_buffer.clear();
         self.previous_input_buffer.clear();
         for work_item_buffer in self.work_item_buffers.values_mut() {
-            work_item_buffer.clear();
+            work_item_buffer.buffer.clear();
         }
     }
 }
@@ -125,6 +234,51 @@ where
     }
 }
 
+/// Information about a render batch that we're building up during a sorted
+/// render phase.
+struct SortedRenderBatch +where + F: GetBatchData, +{ + /// The index of the first phase item in this batch in the list of phase + /// items. + phase_item_start_index: u32, + + /// The index of the first instance in this batch in the instance buffer. + instance_start_index: u32, + + /// The index of the indirect parameters for this batch in the + /// [`IndirectParametersBuffer`]. + /// + /// If CPU culling is being used, then this will be `None`. + indirect_parameters_index: Option, + + /// Metadata that can be used to determine whether an instance can be placed + /// into this batch. + meta: BatchMeta, +} + +impl SortedRenderBatch +where + F: GetBatchData, +{ + /// Finalizes this batch and updates the [`SortedRenderPhase`] with the + /// appropriate indices. + /// + /// `instance_end_index` is the index of the last instance in this batch + /// plus one. + fn flush(self, instance_end_index: u32, phase: &mut SortedRenderPhase) + where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + { + let (batch_range, batch_extra_index) = + phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut(); + *batch_range = self.instance_start_index..instance_end_index; + *batch_extra_index = + PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index); + } +} + /// A system that removes GPU preprocessing work item buffers that correspond to /// deleted [`ViewTarget`]s. /// @@ -148,53 +302,123 @@ pub fn delete_old_work_item_buffers( /// is in use. This means comparing metadata needed to draw each phase item and /// trying to combine the draws into a batch. 
pub fn batch_and_prepare_sorted_render_phase( - gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, - mut views: Query<(Entity, &mut SortedRenderPhase)>, - param: StaticSystemParam, + gpu_array_buffer: ResMut>, + mut indirect_parameters_buffer: ResMut, + mut views: Query<(Entity, &mut SortedRenderPhase, Has)>, + system_param_item: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, GFBD: GetFullBatchData, { - let system_param_item = param.into_inner(); - // We only process GPU-built batch data in this function. let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_batched_instance_buffers.into_inner(); + } = gpu_array_buffer.into_inner(); - for (view, mut phase) in &mut views { + for (view, mut phase, gpu_culling) in &mut views { // Create the work item buffer if necessary. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + gpu_culling, + }); - super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { - let (input_index, compare_data) = - GFBD::get_index_and_compare_data(&system_param_item, item.entity())?; - let output_index = data_buffer.add() as u32; + // Walk through the list of phase items, building up batches as we go. + let mut batch: Option> = None; + for current_index in 0..phase.items.len() { + // Get the index of the input data, and comparison metadata, for + // this entity. + let current_batch_input_index = GFBD::get_index_and_compare_data( + &system_param_item, + phase.items[current_index].entity(), + ); + + // Unpack that index and metadata. 
+ let (current_input_index, current_meta); + match current_batch_input_index { + Some((input_index, Some(current_compare_data))) => { + current_input_index = Some(input_index); + current_meta = Some(BatchMeta::new( + &phase.items[current_index], + current_compare_data, + )); + } + _ => { + current_input_index = None; + current_meta = None; + } + } - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, + // Determine if this entity can be included in the batch we're + // building up. + let can_batch = batch.as_ref().is_some_and(|batch| { + current_meta + .as_ref() + .is_some_and(|current_meta| batch.meta == *current_meta) }); - *item.batch_range_mut() = output_index..output_index + 1; + // Make space in the data buffer for this instance. + let current_entity = phase.items[current_index].entity(); + let output_index = data_buffer.add() as u32; + + // If we can't batch, break the existing batch and make a new one. + if !can_batch { + // Break a batch if we need to. + if let Some(batch) = batch.take() { + batch.flush(output_index, &mut phase); + } + + // Start a new batch. + batch = current_meta.map(|meta| { + let indirect_parameters_index = if gpu_culling { + GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + current_entity, + output_index, + ) + } else { + None + }; + SortedRenderBatch { + phase_item_start_index: current_index as u32, + instance_start_index: output_index, + indirect_parameters_index, + meta, + } + }); + } + + // Add a new preprocessing work item so that the preprocessing + // shader will copy the per-instance data over. 
+ if let (Some(batch), Some(input_index)) = (batch.as_ref(), current_input_index.as_ref()) + { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: (*input_index).into(), + output_index: match batch.indirect_parameters_index { + Some(indirect_parameters_index) => indirect_parameters_index.into(), + None => output_index, + }, + }); + } + } - compare_data - }); + // Flush the final batch if necessary. + if let Some(batch) = batch.take() { + batch.flush(data_buffer.len() as u32, &mut phase); + } } } /// Creates batches for a render phase that uses bins. pub fn batch_and_prepare_binned_render_phase( - gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, - mut views: Query<(Entity, &mut BinnedRenderPhase)>, + gpu_array_buffer: ResMut>, + mut indirect_parameters_buffer: ResMut, + mut views: Query<(Entity, &mut BinnedRenderPhase, Has)>, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, @@ -206,16 +430,20 @@ pub fn batch_and_prepare_binned_render_phase( ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_batched_instance_buffers.into_inner(); + } = gpu_array_buffer.into_inner(); - for (view, mut phase) in &mut views { + for (view, mut phase, gpu_culling) in &mut views { let phase = &mut *phase; // Borrow checker. // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + gpu_culling, + }); // Prepare batchables. 
@@ -227,19 +455,50 @@ pub fn batch_and_prepare_binned_render_phase( }; let output_index = data_buffer.add() as u32; - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, - }); - - batch - .get_or_insert(BinnedRenderPhaseBatch { - representative_entity: entity, - instance_range: output_index..output_index, - dynamic_offset: None, - }) - .instance_range - .end = output_index + 1; + match batch { + Some(ref mut batch) => { + batch.instance_range.end = output_index + 1; + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: batch + .extra_index + .as_indirect_parameters_index() + .unwrap_or(output_index), + }); + } + + None if gpu_culling => { + let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + entity, + output_index, + ); + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: indirect_parameters_index.unwrap_or_default().into(), + }); + batch = Some(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index + 1, + extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index( + indirect_parameters_index, + ), + }); + } + + None => { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + batch = Some(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index + 1, + extra_index: PhaseItemExtraIndex::NONE, + }); + } + } } if let Some(batch) = batch { @@ -256,18 +515,38 @@ pub fn batch_and_prepare_binned_render_phase( }; let output_index = data_buffer.add() as u32; - work_item_buffer.push(PreprocessWorkItem { - input_index: input_index.into(), - output_index, - }); - - unbatchables - .buffer_indices - .add(GpuArrayBufferIndex:: { - index: output_index, - dynamic_offset: None, - element_type: PhantomData, + if 
gpu_culling { + let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index( + &system_param_item, + &mut indirect_parameters_buffer, + entity, + output_index, + ) + .unwrap_or_default(); + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index: indirect_parameters_index.into(), + }); + unbatchables + .buffer_indices + .add(UnbatchableBinnedEntityIndices { + instance_index: indirect_parameters_index.into(), + extra_index: PhaseItemExtraIndex::indirect_parameters_index( + indirect_parameters_index.into(), + ), + }); + } else { + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, }); + unbatchables + .buffer_indices + .add(UnbatchableBinnedEntityIndices { + instance_index: output_index, + extra_index: PhaseItemExtraIndex::NONE, + }); + } } } } @@ -277,23 +556,26 @@ pub fn batch_and_prepare_binned_render_phase( pub fn write_batched_instance_buffers( render_device: Res, render_queue: Res, - mut gpu_batched_instance_buffers: ResMut< - BatchedInstanceBuffers, - >, + gpu_array_buffer: ResMut>, ) where GFBD: GetFullBatchData, { - gpu_batched_instance_buffers - .data_buffer - .write_buffer(&render_device); - gpu_batched_instance_buffers - .current_input_buffer - .write_buffer(&render_device, &render_queue); + let BatchedInstanceBuffers { + ref mut data_buffer, + work_item_buffers: ref mut index_buffers, + ref mut current_input_buffer, + previous_input_buffer: _, + } = gpu_array_buffer.into_inner(); + + data_buffer.write_buffer(&render_device); + current_input_buffer.write_buffer(&render_device, &render_queue); // There's no need to write `previous_input_buffer`, as we wrote // that on the previous frame, and it hasn't changed. 
- for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { - work_item_buffer.write_buffer(&render_device, &render_queue); + for index_buffer in index_buffers.values_mut() { + index_buffer + .buffer + .write_buffer(&render_device, &render_queue); } } @@ -303,3 +585,12 @@ pub fn write_batched_instance_buffers( pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool { render_device.limits().max_compute_workgroup_size_x > 0 } + +pub fn write_indirect_parameters_buffer( + render_device: Res, + render_queue: Res, + mut indirect_parameters_buffer: ResMut, +) { + indirect_parameters_buffer.write_buffer(&render_device, &render_queue); + indirect_parameters_buffer.clear(); +} diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 6811451f37dbd0..32ac134441bc45 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -14,6 +14,8 @@ use crate::{ render_resource::{CachedRenderPipelineId, GpuArrayBufferable}, }; +use self::gpu_preprocessing::IndirectParametersBuffer; + pub mod gpu_preprocessing; pub mod no_gpu_preprocessing; @@ -52,7 +54,7 @@ impl BatchMeta { BatchMeta { pipeline_id: item.cached_pipeline(), draw_function_id: item.draw_function(), - dynamic_offset: item.dynamic_offset(), + dynamic_offset: item.extra_index().as_dynamic_offset(), user_data, } } @@ -133,6 +135,19 @@ pub trait GetFullBatchData: GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option; + + /// Pushes [`gpu_preprocessing::IndirectParameters`] necessary to draw this + /// batch onto the given [`IndirectParametersBuffer`], and returns its + /// index. + /// + /// This is only used if GPU culling is enabled (which requires GPU + /// preprocessing). 
+ fn get_batch_indirect_parameters_index( + param: &SystemParamItem, + indirect_parameters_buffer: &mut IndirectParametersBuffer, + entity: Entity, + instance_index: u32, + ) -> Option; } /// A system that runs early in extraction and clears out all the @@ -151,6 +166,7 @@ pub fn clear_batched_instance_buffers( ) where GFBD: GetFullBatchData, { + // Clear out the CPU-batched instance buffers, if present. if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { cpu_batched_instance_buffer.clear(); } diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs index 429fe5bb4542e7..01ed925fe9e40e 100644 --- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs @@ -8,7 +8,7 @@ use wgpu::BindingResource; use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, - SortedPhaseItem, SortedRenderPhase, + PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, }, render_resource::{GpuArrayBuffer, GpuArrayBufferable}, renderer::{RenderDevice, RenderQueue}, @@ -66,8 +66,9 @@ pub fn batch_and_prepare_sorted_render_phase( let buffer_index = batched_instance_buffer.push(buffer_data); let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; + let (batch_range, extra_index) = item.batch_range_and_extra_index_mut(); + *batch_range = index..index + 1; + *extra_index = PhaseItemExtraIndex::maybe_dynamic_offset(buffer_index.dynamic_offset); compare_data }); @@ -77,13 +78,14 @@ pub fn batch_and_prepare_sorted_render_phase( /// Creates batches for a render phase that uses bins, when GPU batch data /// building isn't in use. 
pub fn batch_and_prepare_binned_render_phase( - mut buffer: ResMut>, + gpu_array_buffer: ResMut>, mut views: Query<&mut BinnedRenderPhase>, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, GFBD: GetFullBatchData, { + let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); for mut phase in &mut views { @@ -98,7 +100,7 @@ pub fn batch_and_prepare_binned_render_phase( else { continue; }; - let instance = buffer.push(buffer_data); + let instance = gpu_array_buffer.push(buffer_data); // If the dynamic offset has changed, flush the batch. // @@ -107,12 +109,15 @@ pub fn batch_and_prepare_binned_render_phase( // with no storage buffers. if !batch_set.last().is_some_and(|batch| { batch.instance_range.end == instance.index - && batch.dynamic_offset == instance.dynamic_offset + && batch.extra_index + == PhaseItemExtraIndex::maybe_dynamic_offset(instance.dynamic_offset) }) { batch_set.push(BinnedRenderPhaseBatch { representative_entity: entity, instance_range: instance.index..instance.index, - dynamic_offset: instance.dynamic_offset, + extra_index: PhaseItemExtraIndex::maybe_dynamic_offset( + instance.dynamic_offset, + ), }); } @@ -132,8 +137,8 @@ pub fn batch_and_prepare_binned_render_phase( else { continue; }; - let instance = buffer.push(buffer_data); - unbatchables.buffer_indices.add(instance); + let instance = gpu_array_buffer.push(buffer_data); + unbatchables.buffer_indices.add(instance.into()); } } } diff --git a/crates/bevy_render/src/camera/camera.rs b/crates/bevy_render/src/camera/camera.rs index 4d25ada7859f35..9c5211cb301fcb 100644 --- a/crates/bevy_render/src/camera/camera.rs +++ b/crates/bevy_render/src/camera/camera.rs @@ -6,7 +6,7 @@ use crate::{ render_graph::{InternedRenderSubGraph, RenderSubGraph}, render_resource::TextureView, texture::GpuImage, - view::{ColorGrading, ExtractedView, ExtractedWindows, RenderLayers, VisibleEntities}, + view::{ColorGrading, ExtractedView, ExtractedWindows, GpuCulling, 
RenderLayers, VisibleEntities}, Extract, }; use bevy_asset::{AssetEvent, AssetId, Assets, Handle}; @@ -17,6 +17,7 @@ use bevy_ecs::{ entity::Entity, event::EventReader, prelude::With, + query::Has, reflect::ReflectComponent, system::{Commands, Query, Res, ResMut, Resource}, }; @@ -827,6 +828,7 @@ pub fn extract_cameras( Option<&TemporalJitter>, Option<&RenderLayers>, Option<&Projection>, + Has, )>, >, primary_window: Extract>>, @@ -844,6 +846,7 @@ pub fn extract_cameras( temporal_jitter, render_layers, projection, + gpu_culling, ) in query.iter() { let color_grading = *color_grading.unwrap_or(&ColorGrading::default()); @@ -915,6 +918,10 @@ pub fn extract_cameras( if let Some(perspective) = projection { commands.insert(perspective.clone()); } + + if gpu_culling { + commands.insert(GpuCulling); + } } } } diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index cfbba600f0c05b..819c81d276755e 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -52,6 +52,7 @@ pub mod prelude { }; } +use batching::gpu_preprocessing::BatchingPlugin; use bevy_ecs::schedule::ScheduleBuildSettings; use bevy_utils::prelude::default; pub use extract_param::Extract; @@ -332,6 +333,7 @@ impl Plugin for RenderPlugin { MeshPlugin, GlobalsPlugin, MorphPlugin, + BatchingPlugin, )); app.register_type::() diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl index 4070a8679a5b88..eb672b00f0afab 100644 --- a/crates/bevy_render/src/maths.wgsl +++ b/crates/bevy_render/src/maths.wgsl @@ -52,6 +52,11 @@ fn inverse_affine3(affine: mat4x3) -> mat4x3 { return mat4x3(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3])); } +// Extracts the upper 3x3 portion of a 4x4 matrix. +fn mat4x4_to_mat3x3(m: mat4x4) -> mat3x3 { + return mat3x3(m[0].xyz, m[1].xyz, m[2].xyz); +} + // Creates an orthonormal basis given a Z vector and an up vector (which becomes // Y after orthonormalization). 
// @@ -64,3 +69,16 @@ fn orthonormalize(z_unnormalized: vec3, up: vec3) -> mat3x3 { let y_basis = cross(z_basis, x_basis); return mat3x3(x_basis, y_basis, z_basis); } + +// Returns true if any part of a sphere is on the positive side of a plane. +// +// `sphere_center.w` should be 1.0. +// +// This is used for frustum culling. +fn sphere_is_inside_frustum_plane( + frustum_plane: vec4, + sphere_center: vec4, + sphere_radius: f32 +) -> bool { + return dot(frustum_plane, sphere_center) + sphere_radius > 0.0; +} diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 58e1fa550f4e2a..3961b306f300ed 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -51,7 +51,14 @@ use bevy_ecs::{ system::{lifetimeless::SRes, SystemParamItem}, }; use smallvec::SmallVec; -use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex}; +use std::{ + fmt::{self, Debug, Formatter}, + hash::Hash, + iter, + marker::PhantomData, + ops::Range, + slice::SliceIndex, +}; /// A collection of all rendering instructions, that will be executed by the GPU, for a /// single render phase for a single view. @@ -124,7 +131,7 @@ pub struct BinnedRenderPhaseBatch { /// /// Note that dynamic offsets are only used on platforms that don't support /// storage buffers. - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } /// Information about the unbatchable entities in a bin. @@ -133,7 +140,7 @@ pub(crate) struct UnbatchableBinnedEntities { pub(crate) entities: Vec, /// The GPU array buffer indices of each unbatchable binned entity. - pub(crate) buffer_indices: UnbatchableBinnedEntityBufferIndex, + pub(crate) buffer_indices: UnbatchableBinnedEntityIndexSet, } /// Stores instance indices and dynamic offsets for unbatchable entities in a @@ -145,7 +152,7 @@ pub(crate) struct UnbatchableBinnedEntities { /// platforms that aren't WebGL 2. 
#[derive(Default)] -pub(crate) enum UnbatchableBinnedEntityBufferIndex { +pub(crate) enum UnbatchableBinnedEntityIndexSet { /// There are no unbatchable entities in this bin (yet). #[default] NoEntities, @@ -155,26 +162,42 @@ pub(crate) enum UnbatchableBinnedEntityBufferIndex { /// /// This is the typical case on platforms other than WebGL 2. We special /// case this to avoid allocation on those platforms. - NoDynamicOffsets { + Sparse { /// The range of indices. instance_range: Range, + /// The index of the first indirect instance parameters. + /// + /// The other indices immediately follow these. + first_indirect_parameters_index: Option, }, /// Dynamic uniforms are present for unbatchable entities in this bin. /// /// We fall back to this on WebGL 2. - DynamicOffsets(Vec), + Dense(Vec), } /// The instance index and dynamic offset (if present) for an unbatchable entity. /// /// This is only useful on platforms that don't support storage buffers. #[derive(Clone, Copy)] -pub(crate) struct UnbatchableBinnedEntityDynamicOffset { +pub(crate) struct UnbatchableBinnedEntityIndices { /// The instance index. - instance_index: u32, - /// The dynamic offset, if present. - dynamic_offset: Option, + pub(crate) instance_index: u32, + /// The [`PhaseItemExtraIndex`], if present. + pub(crate) extra_index: PhaseItemExtraIndex, +} + +impl From> for UnbatchableBinnedEntityIndices +where + T: Clone + ShaderSize + WriteInto, +{ + fn from(value: GpuArrayBufferIndex) -> Self { + UnbatchableBinnedEntityIndices { + instance_index: value.index, + extra_index: PhaseItemExtraIndex::maybe_dynamic_offset(value.dynamic_offset), + } + } } impl BinnedRenderPhase @@ -227,7 +250,7 @@ where key.clone(), batch.representative_entity, batch.instance_range.clone(), - batch.dynamic_offset, + batch.extra_index, ); // Fetch the draw function. 
@@ -246,17 +269,26 @@ where let unbatchable_entities = &self.unbatchable_values[key]; for (entity_index, &entity) in unbatchable_entities.entities.iter().enumerate() { let unbatchable_dynamic_offset = match &unbatchable_entities.buffer_indices { - UnbatchableBinnedEntityBufferIndex::NoEntities => { + UnbatchableBinnedEntityIndexSet::NoEntities => { // Shouldn't happen… continue; } - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range } => { - UnbatchableBinnedEntityDynamicOffset { - instance_index: instance_range.start + entity_index as u32, - dynamic_offset: None, - } - } - UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref dynamic_offsets) => { + UnbatchableBinnedEntityIndexSet::Sparse { + instance_range, + first_indirect_parameters_index, + } => UnbatchableBinnedEntityIndices { + instance_index: instance_range.start + entity_index as u32, + extra_index: match first_indirect_parameters_index { + None => PhaseItemExtraIndex::NONE, + Some(first_indirect_parameters_index) => { + PhaseItemExtraIndex::indirect_parameters_index( + u32::from(*first_indirect_parameters_index) + + entity_index as u32, + ) + } + }, + }, + UnbatchableBinnedEntityIndexSet::Dense(ref dynamic_offsets) => { dynamic_offsets[entity_index] } }; @@ -266,7 +298,7 @@ where entity, unbatchable_dynamic_offset.instance_index ..(unbatchable_dynamic_offset.instance_index + 1), - unbatchable_dynamic_offset.dynamic_offset, + unbatchable_dynamic_offset.extra_index, ); // Fetch the draw function. @@ -300,6 +332,41 @@ where } } +impl UnbatchableBinnedEntityIndexSet { + /// Returns the [`UnbatchableBinnedEntityIndices`] for the given entity. 
+ fn indices_for_entity_index( + &self, + entity_index: u32, + ) -> Option { + match self { + UnbatchableBinnedEntityIndexSet::NoEntities => None, + UnbatchableBinnedEntityIndexSet::Sparse { + instance_range, + first_indirect_parameters_index, + } => { + if entity_index >= instance_range.len() as u32 { + None + } else { + Some(UnbatchableBinnedEntityIndices { + instance_index: instance_range.start + entity_index, + extra_index: match first_indirect_parameters_index { + None => PhaseItemExtraIndex::NONE, + Some(first_indirect_parameters_index) => { + PhaseItemExtraIndex::indirect_parameters_index( + u32::from(*first_indirect_parameters_index) + entity_index, + ) + } + }, + }) + } + } + UnbatchableBinnedEntityIndexSet::Dense(ref indices) => { + indices.get(entity_index as usize).copied() + } + } + } +} + /// A convenient abstraction for adding all the systems necessary for a binned /// render phase to the render app. /// @@ -395,74 +462,62 @@ where } } -impl UnbatchableBinnedEntityBufferIndex { +impl UnbatchableBinnedEntityIndexSet { /// Adds a new entity to the list of unbatchable binned entities. - pub fn add(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex) - where - T: ShaderSize + WriteInto + Clone, - { - match (&mut *self, gpu_array_buffer_index.dynamic_offset) { - (UnbatchableBinnedEntityBufferIndex::NoEntities, None) => { - // This is the first entity we've seen, and we're not on WebGL - // 2. Initialize the fast path. - *self = UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { - instance_range: gpu_array_buffer_index.index - ..(gpu_array_buffer_index.index + 1), + pub fn add(&mut self, indices: UnbatchableBinnedEntityIndices) { + match self { + UnbatchableBinnedEntityIndexSet::NoEntities => { + if indices.extra_index.is_dynamic_offset() { + // This is the first entity we've seen, and we're on WebGL 2. + // Initialize an array. 
+ *self = UnbatchableBinnedEntityIndexSet::Dense(vec![indices]); + } else { + // This is the first entity we've seen, and we're not on WebGL + // 2. Initialize the fast path. + *self = UnbatchableBinnedEntityIndexSet::Sparse { + instance_range: indices.instance_index..indices.instance_index + 1, + first_indirect_parameters_index: indices + .extra_index + .as_indirect_parameters_index() + .and_then(|index| NonMaxU32::try_from(index).ok()), + } } } - (UnbatchableBinnedEntityBufferIndex::NoEntities, Some(dynamic_offset)) => { - // This is the first entity we've seen, and we're on WebGL 2. - // Initialize an array. - *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(vec![ - UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset: Some(dynamic_offset), - }, - ]); - } - - ( - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { - ref mut instance_range, - }, - None, - ) if instance_range.end == gpu_array_buffer_index.index => { + UnbatchableBinnedEntityIndexSet::Sparse { + ref mut instance_range, + first_indirect_parameters_index, + } if instance_range.end == indices.instance_index + && ((first_indirect_parameters_index.is_none() + && indices.extra_index == PhaseItemExtraIndex::NONE) + || first_indirect_parameters_index.is_some_and( + |first_indirect_parameters_index| { + Some( + u32::from(first_indirect_parameters_index) + instance_range.end + - instance_range.start, + ) == indices.extra_index.as_indirect_parameters_index() + }, + )) => + { // This is the normal case on non-WebGL 2. instance_range.end += 1; } - ( - UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref mut offsets), - dynamic_offset, - ) => { - // This is the normal case on WebGL 2. 
- offsets.push(UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset, - }); - } - - ( - UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range }, - dynamic_offset, - ) => { + UnbatchableBinnedEntityIndexSet::Sparse { instance_range, .. } => { // We thought we were in non-WebGL 2 mode, but we got a dynamic // offset or non-contiguous index anyway. This shouldn't happen, // but let's go ahead and do the sensible thing anyhow: demote // the compressed `NoDynamicOffsets` field to the full // `DynamicOffsets` array. - let mut new_dynamic_offsets: Vec<_> = instance_range - .map(|instance_index| UnbatchableBinnedEntityDynamicOffset { - instance_index, - dynamic_offset: None, - }) + let new_dynamic_offsets = (0..instance_range.len() as u32) + .flat_map(|entity_index| self.indices_for_entity_index(entity_index)) + .chain(iter::once(indices)) .collect(); - new_dynamic_offsets.push(UnbatchableBinnedEntityDynamicOffset { - instance_index: gpu_array_buffer_index.index, - dynamic_offset, - }); - *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(new_dynamic_offsets); + *self = UnbatchableBinnedEntityIndexSet::Dense(new_dynamic_offsets); + } + + UnbatchableBinnedEntityIndexSet::Dense(ref mut dense_indices) => { + dense_indices.push(indices); } } } @@ -604,8 +659,130 @@ pub trait PhaseItem: Sized + Send + Sync + 'static { fn batch_range(&self) -> &Range; fn batch_range_mut(&mut self) -> &mut Range; - fn dynamic_offset(&self) -> Option; - fn dynamic_offset_mut(&mut self) -> &mut Option; + /// Returns the [`PhaseItemExtraIndex`]. + /// + /// If present, this is either a dynamic offset or an indirect parameters + /// index. + fn extra_index(&self) -> PhaseItemExtraIndex; + + /// Returns a pair of mutable references to both the batch range and extra + /// index. 
+ fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex); +} + +/// The "extra index" associated with some [`PhaseItem`]s, alongside the +/// indirect instance index. +/// +/// Sometimes phase items require another index in addition to the range of +/// instances they already have. These can be: +/// +/// * The *dynamic offset*: a `wgpu` dynamic offset into the uniform buffer of +/// instance data. This is used on platforms that don't support storage +/// buffers, to work around uniform buffer size limitations. +/// +/// * The *indirect parameters index*: an index into the buffer that specifies +/// the indirect parameters for this [`PhaseItem`]'s drawcall. This is used when +/// indirect mode is on (as used for GPU culling). +/// +/// Note that our indirect draw functionality requires storage buffers, so it's +/// impossible to have both a dynamic offset and an indirect parameters index. +/// This convenient fact allows us to pack both indices into a single `u32`. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct PhaseItemExtraIndex(pub u32); + +impl Debug for PhaseItemExtraIndex { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if self.is_dynamic_offset() { + write!(f, "DynamicOffset({})", self.offset()) + } else if self.is_indirect_parameters_index() { + write!(f, "IndirectParametersIndex({})", self.offset()) + } else { + write!(f, "None") + } + } +} + +impl PhaseItemExtraIndex { + /// The flag that indicates that this index is an indirect parameter. If not + /// set, this is a dynamic offset. + pub const INDIRECT_PARAMETER_INDEX: u32 = 1 << 31; + /// To extract the index from a packed [`PhaseItemExtraIndex`], bitwise-and + /// the contents with this value. + pub const OFFSET_MASK: u32 = Self::INDIRECT_PARAMETER_INDEX - 1; + /// To extract the flag from a packed [`PhaseItemExtraIndex`], bitwise-and + /// the contents with this value. 
+ pub const FLAGS_MASK: u32 = !Self::OFFSET_MASK; + + /// The special value that indicates that no extra index is present. + pub const NONE: PhaseItemExtraIndex = PhaseItemExtraIndex(u32::MAX); + + #[inline] + fn offset(&self) -> u32 { + self.0 & Self::OFFSET_MASK + } + + #[inline] + fn is_dynamic_offset(&self) -> bool { + *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) == 0 + } + + #[inline] + fn is_indirect_parameters_index(&self) -> bool { + *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) != 0 + } + + #[inline] + pub fn indirect_parameters_index(indirect_parameter_index: u32) -> PhaseItemExtraIndex { + // Make sure we didn't overflow. + debug_assert_eq!(indirect_parameter_index & Self::FLAGS_MASK, 0); + PhaseItemExtraIndex(indirect_parameter_index | Self::INDIRECT_PARAMETER_INDEX) + } + + #[inline] + pub fn maybe_indirect_parameters_index( + maybe_indirect_parameters_index: Option, + ) -> PhaseItemExtraIndex { + match maybe_indirect_parameters_index { + Some(indirect_parameters_index) => { + Self::indirect_parameters_index(indirect_parameters_index.into()) + } + None => PhaseItemExtraIndex::NONE, + } + } + + #[inline] + pub fn dynamic_offset(dynamic_offset: u32) -> PhaseItemExtraIndex { + // Make sure we didn't overflow. 
+ debug_assert_eq!(dynamic_offset & Self::FLAGS_MASK, 0); + + PhaseItemExtraIndex(dynamic_offset) + } + + #[inline] + pub fn maybe_dynamic_offset(maybe_dynamic_offset: Option) -> PhaseItemExtraIndex { + match maybe_dynamic_offset { + Some(dynamic_offset) => Self::dynamic_offset(dynamic_offset.into()), + None => PhaseItemExtraIndex::NONE, + } + } + + #[inline] + pub fn as_dynamic_offset(&self) -> Option { + if self.is_dynamic_offset() { + NonMaxU32::try_from(self.0 & Self::OFFSET_MASK).ok() + } else { + None + } + } + + #[inline] + pub fn as_indirect_parameters_index(&self) -> Option { + if self.is_indirect_parameters_index() { + Some(self.0 & Self::OFFSET_MASK) + } else { + None + } + } } /// Represents phase items that are placed into bins. The `BinKey` specifies @@ -633,7 +810,7 @@ pub trait BinnedPhaseItem: PhaseItem { key: Self::BinKey, representative_entity: Entity, batch_range: Range, - dynamic_offset: Option, + extra_index: PhaseItemExtraIndex, ) -> Self; } diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 8a0f77daafb3dc..7ff60c1f440786 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -228,6 +228,11 @@ where self.len = 0; } + /// Returns the length of the buffer. + pub fn len(&self) -> usize { + self.len + } + /// Materializes the buffer on the GPU with space for `capacity` elements. 
/// /// If the buffer is already big enough, this function doesn't reallocate diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs index 1fef36c5844a05..bdeff34b0d4078 100644 --- a/crates/bevy_render/src/view/mod.rs +++ b/crates/bevy_render/src/view/mod.rs @@ -208,6 +208,12 @@ pub struct PostProcessWrite<'a> { pub destination: &'a TextureView, } +#[derive(Component)] +pub struct GpuCulling; + +#[derive(Component)] +pub struct NoCpuCulling; + impl ViewTarget { pub const TEXTURE_FORMAT_HDR: TextureFormat = TextureFormat::Rgba16Float; diff --git a/crates/bevy_render/src/view/visibility/mod.rs b/crates/bevy_render/src/view/visibility/mod.rs index ee8ec2e7d46408..caa429806572de 100644 --- a/crates/bevy_render/src/view/visibility/mod.rs +++ b/crates/bevy_render/src/view/visibility/mod.rs @@ -20,6 +20,8 @@ use crate::{ primitives::{Aabb, Frustum, Sphere}, }; +use super::NoCpuCulling; + /// User indication of whether an entity is visible. Propagates down the entity hierarchy. 
/// /// If an entity is hidden in this way, all [`Children`] (and all of their children and so on) who @@ -376,6 +378,7 @@ pub fn check_visibility( &Frustum, Option<&RenderLayers>, &Camera, + Has, )>, mut visible_aabb_query: Query<( Entity, @@ -387,7 +390,8 @@ pub fn check_visibility( Has, )>, ) { - for (mut visible_entities, frustum, maybe_view_mask, camera) in &mut view_query { + for (mut visible_entities, frustum, maybe_view_mask, camera, no_cpu_culling) in &mut view_query + { if !camera.is_active { continue; } @@ -418,7 +422,7 @@ pub fn check_visibility( } // If we have an aabb, do frustum culling - if !no_frustum_culling { + if !no_frustum_culling && !no_cpu_culling { if let Some(model_aabb) = maybe_model_aabb { let model = transform.affine(); let model_sphere = Sphere { diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs index 9322b5f4b25a10..46a8c55fb786d7 100644 --- a/crates/bevy_sprite/src/mesh2d/material.rs +++ b/crates/bevy_sprite/src/mesh2d/material.rs @@ -17,8 +17,8 @@ use bevy_render::{ prepare_assets, PrepareAssetError, RenderAsset, RenderAssetPlugin, RenderAssets, }, render_phase::{ - AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, - SetItemPipeline, SortedRenderPhase, TrackedRenderPass, + AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, + RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::{ AsBindGroup, AsBindGroupError, BindGroup, BindGroupId, BindGroupLayout, @@ -451,7 +451,7 @@ pub fn queue_material2d_meshes( sort_key: FloatOrd(mesh_z + material2d.depth_bias), // Batching is done in batch_and_prepare_render_phase batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 257ad61dfbb7f6..e86cd875a21fa6 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ 
b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -653,7 +653,7 @@ impl RenderCommand

for SetMesh2dBindGroup { ) -> RenderCommandResult { let mut dynamic_offsets: [u32; 1] = Default::default(); let mut offset_count = 0; - if let Some(dynamic_offset) = item.dynamic_offset() { + if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() { dynamic_offsets[offset_count] = dynamic_offset.get(); offset_count += 1; } diff --git a/crates/bevy_sprite/src/render/mod.rs b/crates/bevy_sprite/src/render/mod.rs index 202a68dbe20daa..5231177104f77a 100644 --- a/crates/bevy_sprite/src/render/mod.rs +++ b/crates/bevy_sprite/src/render/mod.rs @@ -19,8 +19,8 @@ use bevy_math::{Affine3A, FloatOrd, Quat, Rect, Vec2, Vec4}; use bevy_render::{ render_asset::RenderAssets, render_phase::{ - DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, SetItemPipeline, - SortedRenderPhase, TrackedRenderPass, + DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, RenderCommandResult, + SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::{ binding_types::{sampler, texture_2d, uniform_buffer}, @@ -512,7 +512,7 @@ pub fn queue_sprites( sort_key, // batch_range and dynamic_offset will be calculated in prepare_sprites batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_ui/src/render/mod.rs b/crates/bevy_ui/src/render/mod.rs index c03b90004b7784..b03f78a6d0dc8a 100644 --- a/crates/bevy_ui/src/render/mod.rs +++ b/crates/bevy_ui/src/render/mod.rs @@ -7,8 +7,12 @@ use bevy_core_pipeline::core_2d::graph::{Core2d, Node2d}; use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; use bevy_core_pipeline::{core_2d::Camera2d, core_3d::Camera3d}; use bevy_hierarchy::Parent; -use bevy_render::texture::GpuImage; -use bevy_render::{render_phase::PhaseItem, view::ViewVisibility, ExtractSchedule, Render}; +use bevy_render::{ + render_phase::{PhaseItem, PhaseItemExtraIndex}, + texture::GpuImage, + view::ViewVisibility, + ExtractSchedule, Render, +}; use bevy_sprite::{SpriteAssetEvents, 
TextureAtlas}; pub use pipeline::*; pub use render_pass::*; @@ -901,7 +905,7 @@ pub fn queue_uinodes( ), // batch_range will be calculated in prepare_uinodes batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/crates/bevy_ui/src/render/render_pass.rs b/crates/bevy_ui/src/render/render_pass.rs index e398a46d93d24b..d403a44bedc7f4 100644 --- a/crates/bevy_ui/src/render/render_pass.rs +++ b/crates/bevy_ui/src/render/render_pass.rs @@ -15,7 +15,6 @@ use bevy_render::{ renderer::*, view::*, }; -use nonmax::NonMaxU32; pub struct UiPassNode { ui_view_query: QueryState< @@ -92,7 +91,7 @@ pub struct TransparentUi { pub pipeline: CachedRenderPipelineId, pub draw_function: DrawFunctionId, pub batch_range: Range, - pub dynamic_offset: Option, + pub extra_index: PhaseItemExtraIndex, } impl PhaseItem for TransparentUi { @@ -117,13 +116,13 @@ impl PhaseItem for TransparentUi { } #[inline] - fn dynamic_offset(&self) -> Option { - self.dynamic_offset + fn extra_index(&self) -> PhaseItemExtraIndex { + self.extra_index } #[inline] - fn dynamic_offset_mut(&mut self) -> &mut Option { - &mut self.dynamic_offset + fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range, &mut PhaseItemExtraIndex) { + (&mut self.batch_range, &mut self.extra_index) } } diff --git a/crates/bevy_ui/src/render/ui_material_pipeline.rs b/crates/bevy_ui/src/render/ui_material_pipeline.rs index b4d13ac65ac136..b1beaaa5d11c66 100644 --- a/crates/bevy_ui/src/render/ui_material_pipeline.rs +++ b/crates/bevy_ui/src/render/ui_material_pipeline.rs @@ -671,7 +671,7 @@ pub fn queue_ui_material_nodes( entity.index(), ), batch_range: 0..0, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/2d/mesh2d_manual.rs b/examples/2d/mesh2d_manual.rs index 462b0bbfd6c4d4..5bdc857475880b 100644 --- a/examples/2d/mesh2d_manual.rs +++ b/examples/2d/mesh2d_manual.rs @@ -13,7 +13,10 @@ use bevy::{ render::{ mesh::{GpuMesh, Indices, 
MeshVertexAttribute}, render_asset::{RenderAssetUsages, RenderAssets}, - render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase}, + render_phase::{ + AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, + SortedRenderPhase, + }, render_resource::{ BlendState, ColorTargetState, ColorWrites, Face, FragmentState, FrontFace, MultisampleState, PipelineCache, PolygonMode, PrimitiveState, PrimitiveTopology, @@ -399,7 +402,7 @@ pub fn queue_colored_mesh2d( sort_key: FloatOrd(mesh_z), // This material is not batched batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/3d/deferred_rendering.rs b/examples/3d/deferred_rendering.rs index 165d1de9098fc5..27ce850fb46741 100644 --- a/examples/3d/deferred_rendering.rs +++ b/examples/3d/deferred_rendering.rs @@ -7,11 +7,12 @@ use bevy::{ fxaa::Fxaa, prepass::{DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass}, }, - pbr::NotShadowReceiver, - pbr::{CascadeShadowConfigBuilder, DirectionalLightShadowMap}, - pbr::{DefaultOpaqueRendererMethod, NotShadowCaster, OpaqueRendererMethod}, + pbr::{ + CascadeShadowConfigBuilder, DefaultOpaqueRendererMethod, DirectionalLightShadowMap, + NotShadowCaster, NotShadowReceiver, OpaqueRendererMethod, + }, prelude::*, - render::render_resource::TextureFormat, + render::{render_resource::TextureFormat, view::GpuCulling}, }; fn main() { @@ -64,6 +65,7 @@ fn setup( MotionVectorPrepass, DeferredPrepass, Fxaa::default(), + GpuCulling, )); commands.spawn(DirectionalLightBundle { diff --git a/examples/3d/lighting.rs b/examples/3d/lighting.rs index 001b66bc8d415c..e9939318227d2c 100644 --- a/examples/3d/lighting.rs +++ b/examples/3d/lighting.rs @@ -7,7 +7,7 @@ use bevy::{ color::palettes::css::*, pbr::CascadeShadowConfigBuilder, prelude::*, - render::camera::{Exposure, PhysicalCameraParameters}, + render::{camera::{Exposure, PhysicalCameraParameters}, view::GpuCulling}, }; fn main() { @@ 
-273,7 +273,7 @@ fn setup( transform: Transform::from_xyz(-2.0, 2.5, 5.0).looking_at(Vec3::ZERO, Vec3::Y), exposure: Exposure::from_physical_camera(**parameters), ..default() - }); + }).insert(GpuCulling); } fn update_exposure( diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs index b1c91c8973b070..17fd7823c95ad4 100644 --- a/examples/shader/shader_instancing.rs +++ b/examples/shader/shader_instancing.rs @@ -15,8 +15,8 @@ use bevy::{ mesh::{GpuBufferInfo, GpuMesh, MeshVertexBufferLayoutRef}, render_asset::RenderAssets, render_phase::{ - AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, - SetItemPipeline, SortedRenderPhase, TrackedRenderPass, + AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, + RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass, }, render_resource::*, renderer::RenderDevice, @@ -144,7 +144,7 @@ fn queue_custom( draw_function: draw_custom, distance: rangefinder.distance_translation(&mesh_instance.translation), batch_range: 0..1, - dynamic_offset: None, + extra_index: PhaseItemExtraIndex::NONE, }); } } diff --git a/examples/stress_tests/many_cubes.rs b/examples/stress_tests/many_cubes.rs index 510a83612a6d1b..0cc216025b15ca 100644 --- a/examples/stress_tests/many_cubes.rs +++ b/examples/stress_tests/many_cubes.rs @@ -20,7 +20,7 @@ use bevy::{ batching::NoAutomaticBatching, render_asset::RenderAssetUsages, render_resource::{Extent3d, TextureDimension, TextureFormat}, - view::NoFrustumCulling, + view::{GpuCulling, NoCpuCulling, NoFrustumCulling}, }, window::{PresentMode, WindowResolution}, winit::{UpdateMode, WinitSettings}, @@ -51,7 +51,7 @@ struct Args { #[argh(option, default = "1")] mesh_count: usize, - /// whether to disable frustum culling. Stresses queuing and batching as all mesh material entities in the scene are always drawn. + /// whether to disable all frustum culling. 
Stresses queuing and batching as all mesh material entities in the scene are always drawn. #[argh(switch)] no_frustum_culling: bool, @@ -59,6 +59,14 @@ struct Args { #[argh(switch)] no_automatic_batching: bool, + /// whether to enable GPU culling. + #[argh(switch)] + gpu_culling: bool, + + /// whether to disable CPU culling. + #[argh(switch)] + no_cpu_culling: bool, + /// whether to enable directional light cascaded shadow mapping. #[argh(switch)] shadows: bool, @@ -172,7 +180,14 @@ fn setup( } // camera - commands.spawn(Camera3dBundle::default()); + let mut camera = commands.spawn(Camera3dBundle::default()); + if args.gpu_culling { + camera.insert(GpuCulling); + } + if args.no_cpu_culling { + camera.insert(NoCpuCulling); + } + // Inside-out box around the meshes onto which shadows are cast (though you cannot see them...) commands.spawn(( PbrBundle {