From 16531fb3e3c3ff58e3e41b598ae14d960f71a0d0 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Sun, 28 Apr 2024 07:50:00 -0500
Subject: [PATCH] Implement GPU frustum culling. (#12889)

This commit implements opt-in GPU frustum culling, built on top of the
infrastructure in https://github.com/bevyengine/bevy/pull/12773. To
enable it on a camera, add the `GpuCulling` component to it. To
additionally disable CPU frustum culling, add the `NoCpuCulling`
component. Note that adding `GpuCulling` without `NoCpuCulling`
*currently* does nothing useful. The reason why `GpuCulling` doesn't
automatically imply `NoCpuCulling` is that I intend to follow this patch
up with GPU two-phase occlusion culling, and CPU frustum culling plus
GPU occlusion culling seems like a very commonly-desired mode.

Adding the `GpuCulling` component to a view puts that view into
*indirect mode*. This mode makes all drawcalls indirect, relying on the
mesh preprocessing shader to allocate instances dynamically. In indirect
mode, the `PreprocessWorkItem` `output_index` points not to a
`MeshUniform` instance slot but instead to a set of `wgpu`
`IndirectParameters`, from which it allocates an instance slot
dynamically if frustum culling succeeds. Batch building has been updated
to allocate and track indirect parameter slots, and the AABBs are now
supplied to the GPU as `MeshCullingData`.

A small amount of code relating to the frustum culling has been borrowed
from meshlets and moved into `maths.wgsl`. Note that standard Bevy
frustum culling uses AABBs, while meshlets use bounding spheres; this
means that not as much code can be shared as one might think.

This patch doesn't provide any way to perform GPU culling on shadow
maps, to avoid making this patch bigger than it already is. That can be
a followup.

## Changelog

### Added

* Frustum culling can now optionally be done on the GPU. To enable it,
add the `GpuCulling` component to a camera.
* To disable CPU frustum culling, add `NoCpuCulling` to a camera. Note
that `GpuCulling` doesn't automatically imply `NoCpuCulling`.
---
 crates/bevy_core_pipeline/src/core_2d/mod.rs  |  13 +-
 crates/bevy_core_pipeline/src/core_3d/mod.rs  |  57 +-
 crates/bevy_core_pipeline/src/deferred/mod.rs |  34 +-
 crates/bevy_core_pipeline/src/prepass/mod.rs  |  34 +-
 crates/bevy_gizmos/src/pipeline_2d.rs         |   8 +-
 crates/bevy_gizmos/src/pipeline_3d.rs         |   8 +-
 crates/bevy_pbr/src/material.rs               |   6 +-
 crates/bevy_pbr/src/render/gpu_preprocess.rs  | 325 ++++++++---
 crates/bevy_pbr/src/render/light.rs           |  15 +-
 crates/bevy_pbr/src/render/mesh.rs            | 537 ++++++++++++++----
 .../bevy_pbr/src/render/mesh_preprocess.wgsl  | 123 +++-
 .../src/batching/gpu_preprocessing.rs         | 501 +++++++++++++---
 crates/bevy_render/src/batching/mod.rs        |  17 +-
 .../src/batching/no_gpu_preprocessing.rs      |  23 +-
 crates/bevy_render/src/camera/camera.rs       |  21 +-
 crates/bevy_render/src/lib.rs                 |   2 +
 crates/bevy_render/src/maths.wgsl             |  18 +
 crates/bevy_render/src/render_phase/mod.rs    | 347 ++++++++---
 .../src/render_resource/buffer_vec.rs         |   5 +
 crates/bevy_render/src/view/mod.rs            |   6 +
 crates/bevy_render/src/view/visibility/mod.rs |   8 +-
 crates/bevy_sprite/src/mesh2d/material.rs     |   6 +-
 crates/bevy_sprite/src/mesh2d/mesh.rs         |   2 +-
 crates/bevy_sprite/src/render/mod.rs          |   6 +-
 crates/bevy_ui/src/render/mod.rs              |  10 +-
 crates/bevy_ui/src/render/render_pass.rs      |  11 +-
 .../src/render/ui_material_pipeline.rs        |   2 +-
 examples/2d/mesh2d_manual.rs                  |   7 +-
 examples/3d/deferred_rendering.rs             |   7 +-
 examples/shader/shader_instancing.rs          |   6 +-
 examples/stress_tests/many_cubes.rs           |  21 +-
 31 files changed, 1706 insertions(+), 480 deletions(-)

diff --git a/crates/bevy_core_pipeline/src/core_2d/mod.rs b/crates/bevy_core_pipeline/src/core_2d/mod.rs
index 48d2bb5a4f609..85f986c111dd4 100644
--- a/crates/bevy_core_pipeline/src/core_2d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs
@@ -38,12 +38,11 @@ use bevy_render::{
     render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner},
     render_phase::{
         sort_phase_system, CachedRenderPipelinePhaseItem, DrawFunctionId, DrawFunctions, PhaseItem,
-        SortedPhaseItem, SortedRenderPhase,
+        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
     },
     render_resource::CachedRenderPipelineId,
     Extract, ExtractSchedule, Render, RenderApp, RenderSet,
 };
-use nonmax::NonMaxU32;
 
 use crate::{tonemapping::TonemappingNode, upscaling::UpscalingNode};
 
@@ -91,7 +90,7 @@ pub struct Transparent2d {
     pub pipeline: CachedRenderPipelineId,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transparent2d {
@@ -116,13 +115,13 @@ impl PhaseItem for Transparent2d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs
index 94bc10b4f3501..e5cd6c049173f 100644
--- a/crates/bevy_core_pipeline/src/core_3d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs
@@ -58,7 +58,8 @@ use bevy_render::{
     render_graph::{EmptyNode, RenderGraphApp, ViewNodeRunner},
     render_phase::{
         sort_phase_system, BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem,
-        DrawFunctionId, DrawFunctions, PhaseItem, SortedPhaseItem, SortedRenderPhase,
+        DrawFunctionId, DrawFunctions, PhaseItem, PhaseItemExtraIndex, SortedPhaseItem,
+        SortedRenderPhase,
     },
     render_resource::{
         BindGroupId, CachedRenderPipelineId, Extent3d, FilterMode, Sampler, SamplerDescriptor,
@@ -70,7 +71,6 @@ use bevy_render::{
     Extract, ExtractSchedule, Render, RenderApp, RenderSet,
 };
 use bevy_utils::{tracing::warn, HashMap};
-use nonmax::NonMaxU32;
 
 use crate::{
     core_3d::main_transmissive_pass_3d_node::MainTransmissivePass3dNode,
@@ -183,8 +183,9 @@ pub struct Opaque3d {
     pub representative_entity: Entity,
     /// The ranges of instances.
     pub batch_range: Range<u32>,
-    /// The dynamic offset.
-    pub dynamic_offset: Option<NonMaxU32>,
+    /// An extra index, which is either a dynamic offset or an index in the
+    /// indirect parameters list.
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 /// Data that must be identical in order to batch meshes together.
@@ -229,14 +230,12 @@ impl PhaseItem for Opaque3d {
         &mut self.batch_range
     }
 
-    #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
-    #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -248,13 +247,13 @@ impl BinnedPhaseItem for Opaque3d {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Opaque3d {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -270,7 +269,7 @@ pub struct AlphaMask3d {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3d {
@@ -295,13 +294,13 @@ impl PhaseItem for AlphaMask3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -313,13 +312,13 @@ impl BinnedPhaseItem for AlphaMask3d {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -337,7 +336,7 @@ pub struct Transmissive3d {
     pub entity: Entity,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transmissive3d {
@@ -373,13 +372,13 @@ impl PhaseItem for Transmissive3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -411,7 +410,7 @@ pub struct Transparent3d {
     pub entity: Entity,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Transparent3d {
@@ -436,13 +435,13 @@ impl PhaseItem for Transparent3d {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
diff --git a/crates/bevy_core_pipeline/src/deferred/mod.rs b/crates/bevy_core_pipeline/src/deferred/mod.rs
index 3ccd8caad0e12..1372731820224 100644
--- a/crates/bevy_core_pipeline/src/deferred/mod.rs
+++ b/crates/bevy_core_pipeline/src/deferred/mod.rs
@@ -5,10 +5,12 @@ use std::ops::Range;
 
 use bevy_ecs::prelude::*;
 use bevy_render::{
-    render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem},
+    render_phase::{
+        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem,
+        PhaseItemExtraIndex,
+    },
     render_resource::{CachedRenderPipelineId, TextureFormat},
 };
-use nonmax::NonMaxU32;
 
 use crate::prepass::OpaqueNoLightmap3dBinKey;
 
@@ -26,7 +28,7 @@ pub struct Opaque3dDeferred {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for Opaque3dDeferred {
@@ -51,13 +53,13 @@ impl PhaseItem for Opaque3dDeferred {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -69,13 +71,13 @@ impl BinnedPhaseItem for Opaque3dDeferred {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -96,7 +98,7 @@ pub struct AlphaMask3dDeferred {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3dDeferred {
@@ -121,13 +123,13 @@ impl PhaseItem for AlphaMask3dDeferred {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -138,13 +140,13 @@ impl BinnedPhaseItem for AlphaMask3dDeferred {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs
index 670eca9e8e2d2..88a6ac3970539 100644
--- a/crates/bevy_core_pipeline/src/prepass/mod.rs
+++ b/crates/bevy_core_pipeline/src/prepass/mod.rs
@@ -34,11 +34,13 @@ use bevy_ecs::prelude::*;
 use bevy_reflect::Reflect;
 use bevy_render::{
     mesh::Mesh,
-    render_phase::{BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem},
+    render_phase::{
+        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, PhaseItem,
+        PhaseItemExtraIndex,
+    },
     render_resource::{BindGroupId, CachedRenderPipelineId, Extent3d, TextureFormat, TextureView},
     texture::ColorAttachment,
 };
-use nonmax::NonMaxU32;
 
 pub const NORMAL_PREPASS_FORMAT: TextureFormat = TextureFormat::Rgb10a2Unorm;
 pub const MOTION_VECTOR_PREPASS_FORMAT: TextureFormat = TextureFormat::Rg16Float;
@@ -119,7 +121,7 @@ pub struct Opaque3dPrepass {
     pub representative_entity: Entity,
 
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 // TODO: Try interning these.
@@ -163,13 +165,13 @@ impl PhaseItem for Opaque3dPrepass {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -181,13 +183,13 @@ impl BinnedPhaseItem for Opaque3dPrepass {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Opaque3dPrepass {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
@@ -208,7 +210,7 @@ pub struct AlphaMask3dPrepass {
     pub key: OpaqueNoLightmap3dBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for AlphaMask3dPrepass {
@@ -233,13 +235,13 @@ impl PhaseItem for AlphaMask3dPrepass {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -251,13 +253,13 @@ impl BinnedPhaseItem for AlphaMask3dPrepass {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Self {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_gizmos/src/pipeline_2d.rs b/crates/bevy_gizmos/src/pipeline_2d.rs
index 4fbf9544e22b5..660cec02c92d3 100644
--- a/crates/bevy_gizmos/src/pipeline_2d.rs
+++ b/crates/bevy_gizmos/src/pipeline_2d.rs
@@ -18,7 +18,9 @@ use bevy_ecs::{
 use bevy_math::FloatOrd;
 use bevy_render::{
     render_asset::{prepare_assets, RenderAssets},
-    render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase},
+    render_phase::{
+        AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase,
+    },
     render_resource::*,
     texture::BevyDefault,
     view::{ExtractedView, Msaa, RenderLayers, ViewTarget},
@@ -293,7 +295,7 @@ fn queue_line_gizmos_2d(
                 pipeline,
                 sort_key: FloatOrd(f32::INFINITY),
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
@@ -351,7 +353,7 @@ fn queue_line_joint_gizmos_2d(
                 pipeline,
                 sort_key: FloatOrd(f32::INFINITY),
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_gizmos/src/pipeline_3d.rs b/crates/bevy_gizmos/src/pipeline_3d.rs
index ec9800330d1b7..e247220d541bc 100644
--- a/crates/bevy_gizmos/src/pipeline_3d.rs
+++ b/crates/bevy_gizmos/src/pipeline_3d.rs
@@ -22,7 +22,9 @@ use bevy_ecs::{
 use bevy_pbr::{MeshPipeline, MeshPipelineKey, SetMeshViewBindGroup};
 use bevy_render::{
     render_asset::{prepare_assets, RenderAssets},
-    render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase},
+    render_phase::{
+        AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, SortedRenderPhase,
+    },
     render_resource::*,
     texture::BevyDefault,
     view::{ExtractedView, Msaa, RenderLayers, ViewTarget},
@@ -348,7 +350,7 @@ fn queue_line_gizmos_3d(
                 pipeline,
                 distance: 0.,
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
@@ -437,7 +439,7 @@ fn queue_line_joint_gizmos_3d(
                 pipeline,
                 distance: 0.,
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs
index 395cb1c6e1746..1309934a1e3b0 100644
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@@ -710,7 +710,7 @@ pub fn queue_material_meshes<M: Material>(
                             pipeline: pipeline_id,
                             distance,
                             batch_range: 0..1,
-                            dynamic_offset: None,
+                            extra_index: PhaseItemExtraIndex::NONE,
                         });
                     } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                         let bin_key = Opaque3dBinKey {
@@ -734,7 +734,7 @@ pub fn queue_material_meshes<M: Material>(
                             pipeline: pipeline_id,
                             distance,
                             batch_range: 0..1,
-                            dynamic_offset: None,
+                            extra_index: PhaseItemExtraIndex::NONE,
                         });
                     } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                         let bin_key = OpaqueNoLightmap3dBinKey {
@@ -759,7 +759,7 @@ pub fn queue_material_meshes<M: Material>(
                         pipeline: pipeline_id,
                         distance,
                         batch_range: 0..1,
-                        dynamic_offset: None,
+                        extra_index: PhaseItemExtraIndex::NONE,
                     });
                 }
             }
diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs
index 21eff19668a65..8e32e96678ad6 100644
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@@ -14,27 +14,35 @@ use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d};
 use bevy_ecs::{
     component::Component,
     entity::Entity,
-    query::QueryState,
+    query::{Has, QueryState},
     schedule::{common_conditions::resource_exists, IntoSystemConfigs as _},
     system::{lifetimeless::Read, Commands, Res, ResMut, Resource},
     world::{FromWorld, World},
 };
 use bevy_render::{
-    batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem},
+    batching::gpu_preprocessing::{
+        BatchedInstanceBuffers, GpuPreprocessingSupport, IndirectParameters,
+        IndirectParametersBuffer, PreprocessWorkItem,
+    },
     render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext},
     render_resource::{
-        binding_types::{storage_buffer, storage_buffer_read_only},
+        binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer},
         BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding,
         CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor,
         DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType,
         SpecializedComputePipeline, SpecializedComputePipelines,
     },
-    renderer::{RenderContext, RenderDevice},
+    renderer::{RenderContext, RenderDevice, RenderQueue},
+    view::{GpuCulling, ViewUniform, ViewUniformOffset, ViewUniforms},
     Render, RenderApp, RenderSet,
 };
 use bevy_utils::tracing::warn;
+use bitflags::bitflags;
+use smallvec::{smallvec, SmallVec};
 
-use crate::{graph::NodePbr, MeshInputUniform, MeshUniform};
+use crate::{
+    graph::NodePbr, MeshCullingData, MeshCullingDataBuffer, MeshInputUniform, MeshUniform,
+};
 
 /// The handle to the `mesh_preprocess.wgsl` compute shader.
 pub const MESH_PREPROCESS_SHADER_HANDLE: Handle<Shader> =
@@ -57,20 +65,46 @@ pub struct GpuMeshPreprocessPlugin {
 
 /// The render node for the mesh uniform building pass.
 pub struct GpuPreprocessNode {
-    view_query: QueryState<(Entity, Read<PreprocessBindGroup>)>,
+    view_query: QueryState<(
+        Entity,
+        Read<PreprocessBindGroup>,
+        Read<ViewUniformOffset>,
+        Has<GpuCulling>,
+    )>,
 }
 
-/// The compute shader pipeline for the mesh uniform building pass.
+/// The compute shader pipelines for the mesh uniform building pass.
 #[derive(Resource)]
+pub struct PreprocessPipelines {
+    /// The pipeline used for CPU culling. This pipeline doesn't populate
+    /// indirect parameters.
+    pub direct: PreprocessPipeline,
+    /// The pipeline used for GPU culling. This pipeline populates indirect
+    /// parameters.
+    pub gpu_culling: PreprocessPipeline,
+}
+
+/// The pipeline for the GPU mesh preprocessing shader.
 pub struct PreprocessPipeline {
-    /// The single bind group layout for the compute shader.
+    /// The bind group layout for the compute shader.
     pub bind_group_layout: BindGroupLayout,
     /// The pipeline ID for the compute shader.
     ///
-    /// This gets filled in in `prepare_preprocess_pipeline`.
+    /// This gets filled in in `prepare_preprocess_pipelines`.
     pub pipeline_id: Option<CachedComputePipelineId>,
 }
 
+bitflags! {
+    /// Specifies variants of the mesh preprocessing shader.
+    #[derive(Clone, Copy, PartialEq, Eq, Hash)]
+    pub struct PreprocessPipelineKey: u8 {
+        /// Whether GPU culling is in use.
+        ///
+        /// This `#define`'s `GPU_CULLING` in the shader.
+        const GPU_CULLING = 1;
+    }
+}
+
 /// The compute shader bind group for the mesh uniform building pass.
 ///
 /// This goes on the view.
@@ -94,9 +128,9 @@ impl Plugin for GpuMeshPreprocessPlugin {
 
         // This plugin does nothing if GPU instance buffer building isn't in
         // use.
-        let render_device = render_app.world().resource::<RenderDevice>();
+        let gpu_preprocessing_support = render_app.world().resource::<GpuPreprocessingSupport>();
         if !self.use_gpu_instance_buffer_builder
-            || !gpu_preprocessing::can_preprocess_on_gpu(render_device)
+            || *gpu_preprocessing_support == GpuPreprocessingSupport::None
         {
             return;
         }
@@ -106,17 +140,18 @@ impl Plugin for GpuMeshPreprocessPlugin {
             .add_render_graph_node::<GpuPreprocessNode>(Core3d, NodePbr::GpuPreprocess)
             .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass))
             .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass))
-            .init_resource::<PreprocessPipeline>()
+            .init_resource::<PreprocessPipelines>()
             .init_resource::<SpecializedComputePipelines<PreprocessPipeline>>()
             .add_systems(
                 Render,
                 (
-                    prepare_preprocess_pipeline.in_set(RenderSet::Prepare),
+                    prepare_preprocess_pipelines.in_set(RenderSet::Prepare),
                     prepare_preprocess_bind_groups
                         .run_if(
                             resource_exists::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
                         )
                         .in_set(RenderSet::PrepareBindGroups),
+                    write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
                 )
             );
     }
@@ -148,18 +183,7 @@ impl Node for GpuPreprocessNode {
         } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
 
         let pipeline_cache = world.resource::<PipelineCache>();
-        let preprocess_pipeline = world.resource::<PreprocessPipeline>();
-
-        let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else {
-            warn!("The build mesh uniforms pipeline wasn't created");
-            return Ok(());
-        };
-
-        let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id)
-        else {
-            // This will happen while the pipeline is being compiled and is fine.
-            return Ok(());
-        };
+        let preprocess_pipelines = world.resource::<PreprocessPipelines>();
 
         let mut compute_pass =
             render_context
@@ -169,13 +193,46 @@ impl Node for GpuPreprocessNode {
                     timestamp_writes: None,
                 });
 
-        compute_pass.set_pipeline(preprocess_pipeline);
-
         // Run the compute passes.
-        for (view, bind_group) in self.view_query.iter_manual(world) {
-            let index_buffer = &index_buffers[&view];
-            compute_pass.set_bind_group(0, &bind_group.0, &[]);
-            let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE);
+        for (view, bind_group, view_uniform_offset, gpu_culling) in
+            self.view_query.iter_manual(world)
+        {
+            // Grab the index buffer for this view.
+            let Some(index_buffer) = index_buffers.get(&view) else {
+                warn!("The preprocessing index buffer wasn't present");
+                return Ok(());
+            };
+
+            // Select the right pipeline, depending on whether GPU culling is in
+            // use.
+            let maybe_pipeline_id = if gpu_culling {
+                preprocess_pipelines.gpu_culling.pipeline_id
+            } else {
+                preprocess_pipelines.direct.pipeline_id
+            };
+
+            // Fetch the pipeline.
+            let Some(preprocess_pipeline_id) = maybe_pipeline_id else {
+                warn!("The build mesh uniforms pipeline wasn't ready");
+                return Ok(());
+            };
+
+            let Some(preprocess_pipeline) =
+                pipeline_cache.get_compute_pipeline(preprocess_pipeline_id)
+            else {
+                // This will happen while the pipeline is being compiled and is fine.
+                return Ok(());
+            };
+
+            compute_pass.set_pipeline(preprocess_pipeline);
+
+            let mut dynamic_offsets: SmallVec<[u32; 1]> = smallvec![];
+            if gpu_culling {
+                dynamic_offsets.push(view_uniform_offset.offset);
+            }
+            compute_pass.set_bind_group(0, &bind_group.0, &dynamic_offsets);
+
+            let workgroup_count = index_buffer.buffer.len().div_ceil(WORKGROUP_SIZE);
             compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
         }
 
@@ -183,72 +240,149 @@ impl Node for GpuPreprocessNode {
     }
 }
 
+impl PreprocessPipelines {
+    pub(crate) fn pipelines_are_loaded(&self, pipeline_cache: &PipelineCache) -> bool {
+        self.direct.is_loaded(pipeline_cache) && self.gpu_culling.is_loaded(pipeline_cache)
+    }
+}
+
+impl PreprocessPipeline {
+    fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool {
+        self.pipeline_id
+            .is_some_and(|pipeline_id| pipeline_cache.get_compute_pipeline(pipeline_id).is_some())
+    }
+}
+
 impl SpecializedComputePipeline for PreprocessPipeline {
-    type Key = ();
+    type Key = PreprocessPipelineKey;
+
+    fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor {
+        let mut shader_defs = vec![];
+        if key.contains(PreprocessPipelineKey::GPU_CULLING) {
+            shader_defs.push("INDIRECT".into());
+            shader_defs.push("FRUSTUM_CULLING".into());
+        }
 
-    fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor {
         ComputePipelineDescriptor {
-            label: Some("mesh preprocessing".into()),
+            label: Some(
+                format!(
+                    "mesh preprocessing ({})",
+                    if key.contains(PreprocessPipelineKey::GPU_CULLING) {
+                        "GPU culling"
+                    } else {
+                        "direct"
+                    }
+                )
+                .into(),
+            ),
             layout: vec![self.bind_group_layout.clone()],
             push_constant_ranges: vec![],
             shader: MESH_PREPROCESS_SHADER_HANDLE,
-            shader_defs: vec![],
+            shader_defs,
             entry_point: "main".into(),
         }
     }
 }
 
-impl FromWorld for PreprocessPipeline {
+impl FromWorld for PreprocessPipelines {
     fn from_world(world: &mut World) -> Self {
         let render_device = world.resource::<RenderDevice>();
 
-        let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential(
-            ShaderStages::COMPUTE,
-            (
-                // `current_input`
-                storage_buffer_read_only::<MeshInputUniform>(false),
-                // `previous_input`
-                storage_buffer_read_only::<MeshInputUniform>(false),
-                // `indices`
-                storage_buffer_read_only::<PreprocessWorkItem>(false),
-                // `output`
-                storage_buffer::<MeshUniform>(false),
-            ),
+        // GPU culling bind group parameters are a superset of those in the CPU
+        // culling (direct) shader.
+        let direct_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries();
+        let gpu_culling_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries()
+            .extend_sequential((
+                // `indirect_parameters`
+                storage_buffer::<IndirectParameters>(/*has_dynamic_offset=*/ false),
+                // `mesh_culling_data`
+                storage_buffer_read_only::<MeshCullingData>(/*has_dynamic_offset=*/ false),
+                // `view`
+                uniform_buffer::<ViewUniform>(/*has_dynamic_offset=*/ true),
+            ));
+
+        let direct_bind_group_layout = render_device.create_bind_group_layout(
+            "build mesh uniforms direct bind group layout",
+            &direct_bind_group_layout_entries,
         );
-
-        let bind_group_layout = render_device.create_bind_group_layout(
-            "build mesh uniforms bind group layout",
-            &bind_group_layout_entries,
+        let gpu_culling_bind_group_layout = render_device.create_bind_group_layout(
+            "build mesh uniforms GPU culling bind group layout",
+            &gpu_culling_bind_group_layout_entries,
         );
 
-        PreprocessPipeline {
-            bind_group_layout,
-            pipeline_id: None,
+        PreprocessPipelines {
+            direct: PreprocessPipeline {
+                bind_group_layout: direct_bind_group_layout,
+                pipeline_id: None,
+            },
+            gpu_culling: PreprocessPipeline {
+                bind_group_layout: gpu_culling_bind_group_layout,
+                pipeline_id: None,
+            },
         }
     }
 }
 
-/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary.
-pub fn prepare_preprocess_pipeline(
+fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries {
+    DynamicBindGroupLayoutEntries::sequential(
+        ShaderStages::COMPUTE,
+        (
+            // `current_input`
+            storage_buffer_read_only::<MeshInputUniform>(false),
+            // `previous_input`
+            storage_buffer_read_only::<MeshInputUniform>(false),
+            // `indices`
+            storage_buffer_read_only::<PreprocessWorkItem>(false),
+            // `output`
+            storage_buffer::<MeshUniform>(false),
+        ),
+    )
+}
+
+/// A system that specializes the `mesh_preprocess.wgsl` pipelines if necessary.
+pub fn prepare_preprocess_pipelines(
     pipeline_cache: Res<PipelineCache>,
     mut pipelines: ResMut<SpecializedComputePipelines<PreprocessPipeline>>,
-    mut preprocess_pipeline: ResMut<PreprocessPipeline>,
+    mut preprocess_pipelines: ResMut<PreprocessPipelines>,
 ) {
-    if preprocess_pipeline.pipeline_id.is_some() {
-        return;
-    }
+    preprocess_pipelines.direct.prepare(
+        &pipeline_cache,
+        &mut pipelines,
+        PreprocessPipelineKey::empty(),
+    );
+    preprocess_pipelines.gpu_culling.prepare(
+        &pipeline_cache,
+        &mut pipelines,
+        PreprocessPipelineKey::GPU_CULLING,
+    );
+}
 
-    let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ());
-    preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id);
+impl PreprocessPipeline {
+    fn prepare(
+        &mut self,
+        pipeline_cache: &PipelineCache,
+        pipelines: &mut SpecializedComputePipelines<PreprocessPipeline>,
+        key: PreprocessPipelineKey,
+    ) {
+        if self.pipeline_id.is_some() {
+            return;
+        }
+
+        let preprocess_pipeline_id = pipelines.specialize(pipeline_cache, self, key);
+        self.pipeline_id = Some(preprocess_pipeline_id);
+    }
 }
 
-/// A system that attaches the mesh uniform buffers to the bind group for the
-/// compute shader.
+/// A system that attaches the mesh uniform buffers to the bind groups for the
+/// variants of the mesh preprocessing compute shader.
 pub fn prepare_preprocess_bind_groups(
     mut commands: Commands,
     render_device: Res<RenderDevice>,
     batched_instance_buffers: Res<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
-    pipeline: Res<PreprocessPipeline>,
+    indirect_parameters_buffer: Res<IndirectParametersBuffer>,
+    mesh_culling_data_buffer: Res<MeshCullingDataBuffer>,
+    view_uniforms: Res<ViewUniforms>,
+    pipelines: Res<PreprocessPipelines>,
 ) {
     // Grab the `BatchedInstanceBuffers`.
     let BatchedInstanceBuffers {
@@ -267,7 +401,7 @@ pub fn prepare_preprocess_bind_groups(
     };
 
     for (view, index_buffer_vec) in index_buffers {
-        let Some(index_buffer) = index_buffer_vec.buffer() else {
+        let Some(index_buffer) = index_buffer_vec.buffer.buffer() else {
             continue;
         };
 
@@ -275,15 +409,27 @@ pub fn prepare_preprocess_bind_groups(
         // length and the underlying buffer may be longer than the actual size
         // of the vector.
         let index_buffer_size = NonZeroU64::try_from(
-            index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()),
+            index_buffer_vec.buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()),
         )
         .ok();
 
-        commands
-            .entity(*view)
-            .insert(PreprocessBindGroup(render_device.create_bind_group(
-                "preprocess_bind_group",
-                &pipeline.bind_group_layout,
+        let bind_group = if index_buffer_vec.gpu_culling {
+            let (
+                Some(indirect_parameters_buffer),
+                Some(mesh_culling_data_buffer),
+                Some(view_uniforms_binding),
+            ) = (
+                indirect_parameters_buffer.buffer(),
+                mesh_culling_data_buffer.buffer(),
+                view_uniforms.uniforms.binding(),
+            )
+            else {
+                continue;
+            };
+
+            PreprocessBindGroup(render_device.create_bind_group(
+                "preprocess_gpu_culling_bind_group",
+                &pipelines.gpu_culling.bind_group_layout,
                 &BindGroupEntries::sequential((
                     current_input_buffer.as_entire_binding(),
                     previous_input_buffer.as_entire_binding(),
@@ -293,7 +439,38 @@ pub fn prepare_preprocess_bind_groups(
                         size: index_buffer_size,
                     }),
                     data_buffer.as_entire_binding(),
+                    indirect_parameters_buffer.as_entire_binding(),
+                    mesh_culling_data_buffer.as_entire_binding(),
+                    view_uniforms_binding,
                 )),
-            )));
+            ))
+        } else {
+            PreprocessBindGroup(render_device.create_bind_group(
+                "preprocess_direct_bind_group",
+                &pipelines.direct.bind_group_layout,
+                &BindGroupEntries::sequential((
+                    current_input_buffer.as_entire_binding(),
+                    previous_input_buffer.as_entire_binding(),
+                    BindingResource::Buffer(BufferBinding {
+                        buffer: index_buffer,
+                        offset: 0,
+                        size: index_buffer_size,
+                    }),
+                    data_buffer.as_entire_binding(),
+                )),
+            ))
+        };
+
+        commands.entity(*view).insert(bind_group);
     }
 }
+
+/// Writes the information needed to do GPU mesh culling to the GPU.
+pub fn write_mesh_culling_data_buffer(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
+) {
+    mesh_culling_data_buffer.write_buffer(&render_device, &render_queue);
+    mesh_culling_data_buffer.clear();
+}
diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs
index e09b162197ef7..74e340c01a33e 100644
--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@@ -22,7 +22,6 @@ use bevy_transform::{components::GlobalTransform, prelude::Transform};
 #[cfg(feature = "trace")]
 use bevy_utils::tracing::info_span;
 use bevy_utils::tracing::{error, warn};
-use nonmax::NonMaxU32;
 use std::{hash::Hash, num::NonZeroU64, ops::Range};
 
 use crate::*;
@@ -1734,7 +1733,7 @@ pub struct Shadow {
     pub key: ShadowBinKey,
     pub representative_entity: Entity,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
@@ -1771,13 +1770,13 @@ impl PhaseItem for Shadow {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
@@ -1789,13 +1788,13 @@ impl BinnedPhaseItem for Shadow {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self {
         Shadow {
             key,
             representative_entity,
             batch_range,
-            dynamic_offset,
+            extra_index,
         }
     }
 }
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs
index 5d326e05909cc..fa3cbdee014f2 100644
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@@ -12,13 +12,17 @@ use bevy_ecs::{
     query::ROQueryItem,
     system::{lifetimeless::*, SystemParamItem, SystemState},
 };
-use bevy_math::{vec3, Affine3, Rect, UVec2, Vec3, Vec4};
+use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4};
 use bevy_render::{
     batching::{
-        gpu_preprocessing, no_gpu_preprocessing, GetBatchData, GetFullBatchData,
-        NoAutomaticBatching,
+        gpu_preprocessing::{
+            self, GpuPreprocessingSupport, IndirectParameters, IndirectParametersBuffer,
+        },
+        no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching,
     },
+    camera::Camera,
     mesh::*,
+    primitives::Aabb,
     render_asset::RenderAssets,
     render_phase::{
         BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult,
@@ -27,11 +31,11 @@ use bevy_render::{
     render_resource::*,
     renderer::{RenderDevice, RenderQueue},
     texture::{BevyDefault, DefaultImageSampler, ImageSampler, TextureFormatPixelInfo},
-    view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility},
+    view::{prepare_view_targets, GpuCulling, ViewTarget, ViewUniformOffset, ViewVisibility},
     Extract,
 };
 use bevy_transform::components::GlobalTransform;
-use bevy_utils::{tracing::error, Entry, HashMap, Parallel};
+use bevy_utils::{tracing::error, tracing::warn, Entry, HashMap, Parallel};
 
 #[cfg(debug_assertions)]
 use bevy_utils::warn_once;
@@ -139,6 +143,7 @@ impl Plugin for MeshRenderPlugin {
                 .init_resource::<SkinIndices>()
                 .init_resource::<MorphUniform>()
                 .init_resource::<MorphIndices>()
+                .init_resource::<MeshCullingDataBuffer>()
                 .add_systems(
                     ExtractSchedule,
                     (
@@ -167,9 +172,12 @@ impl Plugin for MeshRenderPlugin {
         let mut mesh_bindings_shader_defs = Vec::with_capacity(1);
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
-            let render_device = render_app.world().resource::<RenderDevice>();
+            render_app.init_resource::<GpuPreprocessingSupport>();
+
+            let gpu_preprocessing_support =
+                render_app.world().resource::<GpuPreprocessingSupport>();
             let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder
-                && gpu_preprocessing::can_preprocess_on_gpu(render_device);
+                && *gpu_preprocessing_support != GpuPreprocessingSupport::None;
 
             let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder);
             render_app.insert_resource(render_mesh_instances);
@@ -209,6 +217,8 @@ impl Plugin for MeshRenderPlugin {
                     );
             };
 
+            let indirect_parameters_buffer = IndirectParametersBuffer::new();
+
             let render_device = render_app.world().resource::<RenderDevice>();
             if let Some(per_object_buffer_batch_size) =
                 GpuArrayBuffer::<MeshUniform>::batch_size(render_device)
@@ -219,7 +229,9 @@ impl Plugin for MeshRenderPlugin {
                 ));
             }
 
-            render_app.init_resource::<MeshPipeline>();
+            render_app
+                .insert_resource(indirect_parameters_buffer)
+                .init_resource::<MeshPipeline>();
         }
 
         // Load the mesh_bindings shader module here as it depends on runtime information about
@@ -295,6 +307,29 @@ pub struct MeshInputUniform {
     pub previous_input_index: u32,
 }
 
+/// Information about each mesh instance needed to cull it on GPU.
+///
+/// This consists of its axis-aligned bounding box (AABB).
+#[derive(ShaderType, Pod, Zeroable, Clone, Copy)]
+#[repr(C)]
+pub struct MeshCullingData {
+    /// The 3D center of the AABB in model space, padded with an extra unused
+    /// float value.
+    pub aabb_center: Vec4,
+    /// The 3D extents of the AABB in model space, divided by two, padded with
+    /// an extra unused float value.
+    pub aabb_half_extents: Vec4,
+}
+
+/// A GPU buffer that holds the information needed to cull meshes on GPU.
+///
+/// At the moment, this simply holds each mesh's AABB.
+///
+/// To avoid wasting CPU time in the CPU culling case, this buffer will be empty
+/// if GPU culling isn't in use.
+#[derive(Resource, Deref, DerefMut)]
+pub struct MeshCullingDataBuffer(BufferVec<MeshCullingData>);
+
 impl MeshUniform {
     pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option<Rect>) -> Self {
         let (inverse_transpose_model_a, inverse_transpose_model_b) =
@@ -403,6 +438,57 @@ pub struct RenderMeshInstanceShared {
     pub flags: RenderMeshInstanceFlags,
 }
 
+/// Information that is gathered during the parallel portion of mesh extraction
+/// when GPU mesh uniform building is enabled.
+///
+/// From this, the [`MeshInputUniform`] and [`RenderMeshInstanceGpu`] are
+/// prepared.
+pub struct RenderMeshInstanceGpuBuilder {
+    /// Data that will be placed on the [`RenderMeshInstanceGpu`].
+    pub shared: RenderMeshInstanceShared,
+    /// The current transform.
+    pub transform: Affine3,
+    /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]:
+    ///
+    /// ```text
+    ///                         <--- MSB                   LSB --->
+    ///                         +---- min v ----+ +---- min u ----+
+    ///     lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu,
+    ///                         +---- max v ----+ +---- max u ----+
+    ///     lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU,
+    ///
+    /// (MSB: most significant bit; LSB: least significant bit.)
+    /// ```
+    pub lightmap_uv_rect: UVec2,
+    /// The index of the previous mesh input.
+    pub previous_input_index: Option<NonMaxU32>,
+    /// Various flags.
+    pub mesh_flags: MeshFlags,
+}
+
+/// The per-thread queues used during [`extract_meshes_for_gpu_building`].
+///
+/// There are two varieties of these: one for when culling happens on CPU and
+/// one for when culling happens on GPU. Having the two varieties avoids wasting
+/// space if GPU culling is disabled.
+#[derive(Default)]
+pub enum RenderMeshInstanceGpuQueue {
+    /// The default value.
+    ///
+    /// This becomes [`RenderMeshInstanceGpuQueue::CpuCulling`] or
+    /// [`RenderMeshInstanceGpuQueue::GpuCulling`] once extraction starts.
+    #[default]
+    None,
+    /// The version of [`RenderMeshInstanceGpuQueue`] that omits the
+    /// [`MeshCullingDataGpuBuilder`], so that we don't waste space when GPU
+    /// culling is disabled.
+    CpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder)>),
+    /// The version of [`RenderMeshInstanceGpuQueue`] that contains the
+    /// [`MeshCullingDataGpuBuilder`], used when any view has GPU culling
+    /// enabled.
+    GpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder, MeshCullingData)>),
+}
+
 impl RenderMeshInstanceShared {
     fn from_components(
         previous_transform: Option<&PreviousGlobalTransform>,
@@ -494,41 +580,147 @@ impl RenderMeshInstances {
     }
 }
 
-pub(crate) trait RenderMeshInstancesTable {
-    /// Returns the ID of the mesh asset attached to the given entity, if any.
-    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>>;
+impl RenderMeshInstancesCpu {
+    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
+        self.get(&entity)
+            .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id)
+    }
 
-    /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a
-    /// mesh attached.
-    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData>;
+    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
+        self.get(&entity)
+            .map(|render_mesh_instance| RenderMeshQueueData {
+                shared: &render_mesh_instance.shared,
+                translation: render_mesh_instance.transforms.transform.translation,
+            })
+    }
 }
 
-impl RenderMeshInstancesTable for RenderMeshInstancesCpu {
+impl RenderMeshInstancesGpu {
     fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
-        self.get(&entity).map(|instance| instance.mesh_asset_id)
+        self.get(&entity)
+            .map(|render_mesh_instance| render_mesh_instance.mesh_asset_id)
     }
 
     fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
-        self.get(&entity).map(|instance| RenderMeshQueueData {
-            shared: &instance.shared,
-            translation: instance.transforms.transform.translation,
-        })
+        self.get(&entity)
+            .map(|render_mesh_instance| RenderMeshQueueData {
+                shared: &render_mesh_instance.shared,
+                translation: render_mesh_instance.translation,
+            })
     }
 }
 
-impl RenderMeshInstancesTable for RenderMeshInstancesGpu {
-    /// Returns the ID of the mesh asset attached to the given entity, if any.
-    fn mesh_asset_id(&self, entity: Entity) -> Option<AssetId<Mesh>> {
-        self.get(&entity).map(|instance| instance.mesh_asset_id)
+impl RenderMeshInstanceGpuQueue {
+    /// Clears out a [`RenderMeshInstanceGpuQueue`], creating or recreating it
+    /// as necessary.
+    ///
+    /// `any_gpu_culling` should be set to true if any view has GPU culling
+    /// enabled.
+    fn init(&mut self, any_gpu_culling: bool) {
+        match (any_gpu_culling, &mut *self) {
+            (true, RenderMeshInstanceGpuQueue::GpuCulling(queue)) => queue.clear(),
+            (true, _) => *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![]),
+            (false, RenderMeshInstanceGpuQueue::CpuCulling(queue)) => queue.clear(),
+            (false, _) => *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![]),
+        }
     }
 
-    /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a
-    /// mesh attached.
-    fn render_mesh_queue_data(&self, entity: Entity) -> Option<RenderMeshQueueData> {
-        self.get(&entity).map(|instance| RenderMeshQueueData {
-            shared: &instance.shared,
-            translation: instance.translation,
-        })
+    /// Adds a new mesh to this queue.
+    fn push(
+        &mut self,
+        entity: Entity,
+        instance_builder: RenderMeshInstanceGpuBuilder,
+        culling_data_builder: Option<MeshCullingData>,
+    ) {
+        match (&mut *self, culling_data_builder) {
+            (&mut RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue), None) => {
+                queue.push((entity, instance_builder));
+            }
+            (
+                &mut RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue),
+                Some(culling_data_builder),
+            ) => {
+                queue.push((entity, instance_builder, culling_data_builder));
+            }
+            (_, None) => {
+                *self = RenderMeshInstanceGpuQueue::CpuCulling(vec![(entity, instance_builder)]);
+            }
+            (_, Some(culling_data_builder)) => {
+                *self = RenderMeshInstanceGpuQueue::GpuCulling(vec![(
+                    entity,
+                    instance_builder,
+                    culling_data_builder,
+                )]);
+            }
+        }
+    }
+}
+
+impl RenderMeshInstanceGpuBuilder {
+    /// Flushes this mesh instance to the [`RenderMeshInstanceGpu`] and
+    /// [`MeshInputUniform`] tables.
+    fn add_to(
+        self,
+        entity: Entity,
+        render_mesh_instances: &mut EntityHashMap<RenderMeshInstanceGpu>,
+        current_input_buffer: &mut BufferVec<MeshInputUniform>,
+    ) -> usize {
+        // Push the mesh input uniform.
+        let current_uniform_index = current_input_buffer.push(MeshInputUniform {
+            transform: self.transform.to_transpose(),
+            lightmap_uv_rect: self.lightmap_uv_rect,
+            flags: self.mesh_flags.bits(),
+            previous_input_index: match self.previous_input_index {
+                Some(previous_input_index) => previous_input_index.into(),
+                None => u32::MAX,
+            },
+        });
+
+        // Record the [`RenderMeshInstance`].
+        render_mesh_instances.insert(
+            entity,
+            RenderMeshInstanceGpu {
+                translation: self.transform.translation,
+                shared: self.shared,
+                current_uniform_index: (current_uniform_index as u32)
+                    .try_into()
+                    .unwrap_or_default(),
+            },
+        );
+
+        current_uniform_index
+    }
+}
+
+impl MeshCullingData {
+    /// Returns a new [`MeshCullingData`] initialized with the given AABB.
+    ///
+    /// If no AABB is provided, an infinitely-large one is conservatively
+    /// chosen.
+    fn new(aabb: Option<&Aabb>) -> Self {
+        match aabb {
+            Some(aabb) => MeshCullingData {
+                aabb_center: aabb.center.extend(0.0),
+                aabb_half_extents: aabb.half_extents.extend(0.0),
+            },
+            None => MeshCullingData {
+                aabb_center: Vec3::ZERO.extend(0.0),
+                aabb_half_extents: Vec3::INFINITY.extend(0.0),
+            },
+        }
+    }
+
+    /// Flushes this mesh instance culling data to the
+    /// [`MeshCullingDataBuffer`].
+    fn add_to(&self, mesh_culling_data_buffer: &mut MeshCullingDataBuffer) -> usize {
+        mesh_culling_data_buffer.push(*self)
+    }
+}
+
+impl Default for MeshCullingDataBuffer {
+    #[inline]
+    fn default() -> Self {
+        Self(BufferVec::new(BufferUsages::STORAGE))
     }
 }
 
@@ -625,8 +817,8 @@ pub fn extract_meshes_for_cpu_building(
 
     render_mesh_instances.clear();
     for queue in render_mesh_instance_queues.iter_mut() {
-        for (k, v) in queue.drain(..) {
-            render_mesh_instances.insert_unique_unchecked(k, v);
+        for (entity, render_mesh_instance) in queue.drain(..) {
+            render_mesh_instances.insert_unique_unchecked(entity, render_mesh_instance);
         }
     }
 }
@@ -641,9 +833,8 @@ pub fn extract_meshes_for_gpu_building(
     mut batched_instance_buffers: ResMut<
         gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
     >,
-    mut render_mesh_instance_queues: Local<
-        Parallel<Vec<(Entity, RenderMeshInstanceShared, MeshInputUniform)>>,
-    >,
+    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
+    mut render_mesh_instance_queues: Local<Parallel<RenderMeshInstanceGpuQueue>>,
     meshes_query: Extract<
         Query<(
             Entity,
@@ -651,6 +842,7 @@ pub fn extract_meshes_for_gpu_building(
             &GlobalTransform,
             Option<&PreviousGlobalTransform>,
             Option<&Lightmap>,
+            Option<&Aabb>,
             &Handle<Mesh>,
             Has<NotShadowReceiver>,
             Has<TransmittedShadowReceiver>,
@@ -658,25 +850,22 @@ pub fn extract_meshes_for_gpu_building(
             Has<NoAutomaticBatching>,
         )>,
     >,
+    cameras_query: Extract<Query<(), (With<Camera>, With<GpuCulling>)>>,
 ) {
+    let any_gpu_culling = !cameras_query.is_empty();
+    for render_mesh_instance_queue in render_mesh_instance_queues.iter_mut() {
+        render_mesh_instance_queue.init(any_gpu_culling);
+    }
+
     // Collect render mesh instances. Build up the uniform buffer.
     let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances
     else {
         panic!(
-            "`collect_render_mesh_instances_for_gpu_building` should only be called if we're \
-                using GPU `MeshUniform` building"
+            "`extract_meshes_for_gpu_building` should only be called if we're \
+            using GPU `MeshUniform` building"
         );
     };
 
-    let gpu_preprocessing::BatchedInstanceBuffers {
-        ref mut current_input_buffer,
-        ref mut previous_input_buffer,
-        ..
-    } = *batched_instance_buffers;
-
-    // Swap buffers.
-    mem::swap(current_input_buffer, previous_input_buffer);
-
     meshes_query.par_iter().for_each_init(
         || render_mesh_instance_queues.borrow_local_mut(),
         |queue,
@@ -686,6 +875,7 @@ pub fn extract_meshes_for_gpu_building(
             transform,
             previous_transform,
             lightmap,
+            aabb,
             handle,
             not_shadow_receiver,
             transmitted_receiver,
@@ -706,54 +896,92 @@ pub fn extract_meshes_for_gpu_building(
                 no_automatic_batching,
             );
 
-            let previous_input_index = shared
-                .flags
-                .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM)
-                .then(|| {
-                    render_mesh_instances
-                        .get(&entity)
-                        .map(|render_mesh_instance| {
-                            render_mesh_instance.current_uniform_index.into()
-                        })
-                        .unwrap_or(u32::MAX)
-                })
-                .unwrap_or(u32::MAX);
-
             let lightmap_uv_rect =
                 lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect));
-            let affine3: Affine3 = (&transform.affine()).into();
 
-            queue.push((
-                entity,
+            let gpu_mesh_culling_data = any_gpu_culling.then(|| MeshCullingData::new(aabb));
+
+            let previous_input_index = if shared
+                .flags
+                .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM)
+            {
+                render_mesh_instances
+                    .get(&entity)
+                    .map(|render_mesh_instance| render_mesh_instance.current_uniform_index)
+            } else {
+                None
+            };
+
+            let gpu_mesh_instance_builder = RenderMeshInstanceGpuBuilder {
                 shared,
-                MeshInputUniform {
-                    flags: mesh_flags.bits(),
-                    lightmap_uv_rect,
-                    transform: affine3.to_transpose(),
-                    previous_input_index,
-                },
-            ));
+                transform: (&transform.affine()).into(),
+                lightmap_uv_rect,
+                mesh_flags,
+                previous_input_index,
+            };
+
+            queue.push(entity, gpu_mesh_instance_builder, gpu_mesh_culling_data);
         },
     );
 
+    collect_meshes_for_gpu_building(
+        render_mesh_instances,
+        &mut batched_instance_buffers,
+        &mut mesh_culling_data_buffer,
+        &mut render_mesh_instance_queues,
+    );
+}
+
+/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU
+/// mesh uniforms are built.
+fn collect_meshes_for_gpu_building(
+    render_mesh_instances: &mut RenderMeshInstancesGpu,
+    batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers<
+        MeshUniform,
+        MeshInputUniform,
+    >,
+    mesh_culling_data_buffer: &mut MeshCullingDataBuffer,
+    render_mesh_instance_queues: &mut Parallel<RenderMeshInstanceGpuQueue>,
+) {
+    // Collect render mesh instances. Build up the uniform buffer.
+
+    let gpu_preprocessing::BatchedInstanceBuffers {
+        ref mut current_input_buffer,
+        ref mut previous_input_buffer,
+        ..
+    } = batched_instance_buffers;
+
+    // Swap buffers.
+    mem::swap(current_input_buffer, previous_input_buffer);
+
     // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s.
     render_mesh_instances.clear();
+
     for queue in render_mesh_instance_queues.iter_mut() {
-        for (entity, shared, mesh_uniform) in queue.drain(..) {
-            let buffer_index = current_input_buffer.push(mesh_uniform);
-            let translation = vec3(
-                mesh_uniform.transform[0].w,
-                mesh_uniform.transform[1].w,
-                mesh_uniform.transform[2].w,
-            );
-            render_mesh_instances.insert_unique_unchecked(
-                entity,
-                RenderMeshInstanceGpu {
-                    shared,
-                    translation,
-                    current_uniform_index: NonMaxU32::new(buffer_index as u32).unwrap_or_default(),
-                },
-            );
+        match *queue {
+            RenderMeshInstanceGpuQueue::None => {
+                // This can only happen if the queue is empty.
+            }
+            RenderMeshInstanceGpuQueue::CpuCulling(ref mut queue) => {
+                for (entity, mesh_instance_builder) in queue.drain(..) {
+                    mesh_instance_builder.add_to(
+                        entity,
+                        render_mesh_instances,
+                        current_input_buffer,
+                    );
+                }
+            }
+            RenderMeshInstanceGpuQueue::GpuCulling(ref mut queue) => {
+                for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) {
+                    let instance_data_index = mesh_instance_builder.add_to(
+                        entity,
+                        render_mesh_instances,
+                        current_input_buffer,
+                    );
+                    let culling_data_index = mesh_culling_builder.add_to(mesh_culling_data_buffer);
+                    debug_assert_eq!(instance_data_index, culling_data_index);
+                }
+            }
         }
     }
 }
@@ -876,7 +1104,11 @@ impl MeshPipeline {
 }
 
 impl GetBatchData for MeshPipeline {
-    type Param = (SRes<RenderMeshInstances>, SRes<RenderLightmaps>);
+    type Param = (
+        SRes<RenderMeshInstances>,
+        SRes<RenderLightmaps>,
+        SRes<RenderAssets<GpuMesh>>,
+    );
     // The material bind group ID, the mesh ID, and the lightmap ID,
     // respectively.
     type CompareData = (MaterialBindGroupId, AssetId<Mesh>, Option<AssetId<Image>>);
@@ -884,7 +1116,7 @@ impl GetBatchData for MeshPipeline {
     type BufferData = MeshUniform;
 
     fn get_batch_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -915,7 +1147,7 @@ impl GetFullBatchData for MeshPipeline {
     type BufferInputData = MeshInputUniform;
 
     fn get_index_and_compare_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
         // This should only be called during GPU building.
@@ -941,7 +1173,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_batch_data(
-        (mesh_instances, lightmaps): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<Self::BufferData> {
         let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@@ -960,7 +1192,7 @@ impl GetFullBatchData for MeshPipeline {
     }
 
     fn get_binned_index(
-        (mesh_instances, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<NonMaxU32> {
         // This should only be called during GPU building.
@@ -976,6 +1208,70 @@ impl GetFullBatchData for MeshPipeline {
             .get(&entity)
             .map(|entity| entity.current_uniform_index)
     }
+
+    fn get_batch_indirect_parameters_index(
+        (mesh_instances, _, meshes): &SystemParamItem<Self::Param>,
+        indirect_parameters_buffer: &mut IndirectParametersBuffer,
+        entity: Entity,
+        instance_index: u32,
+    ) -> Option<NonMaxU32> {
+        get_batch_indirect_parameters_index(
+            mesh_instances,
+            meshes,
+            indirect_parameters_buffer,
+            entity,
+            instance_index,
+        )
+    }
+}
+
+/// Pushes a set of [`IndirectParameters`] onto the [`IndirectParametersBuffer`]
+/// for the given mesh instance, and returns the index of those indirect
+/// parameters.
+fn get_batch_indirect_parameters_index(
+    mesh_instances: &RenderMeshInstances,
+    meshes: &RenderAssets<GpuMesh>,
+    indirect_parameters_buffer: &mut IndirectParametersBuffer,
+    entity: Entity,
+    instance_index: u32,
+) -> Option<NonMaxU32> {
+    // This should only be called during GPU building.
+    let RenderMeshInstances::GpuBuilding(ref mesh_instances) = *mesh_instances else {
+        error!(
+            "`get_batch_indirect_parameters_index` should never be called in CPU mesh uniform \
+                building mode"
+        );
+        return None;
+    };
+
+    let mesh_instance = mesh_instances.get(&entity)?;
+    let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
+
+    // Note that `IndirectParameters` covers both of these structures, even
+    // though they actually have distinct layouts. See the comment above that
+    // type for more information.
+    let indirect_parameters = match mesh.buffer_info {
+        GpuBufferInfo::Indexed {
+            count: index_count, ..
+        } => IndirectParameters {
+            vertex_or_index_count: index_count,
+            instance_count: 0,
+            first_vertex: 0,
+            base_vertex_or_first_instance: 0,
+            first_instance: instance_index,
+        },
+        GpuBufferInfo::NonIndexed => IndirectParameters {
+            vertex_or_index_count: mesh.vertex_count,
+            instance_count: 0,
+            first_vertex: 0,
+            base_vertex_or_first_instance: instance_index,
+            first_instance: instance_index,
+        },
+    };
+
+    (indirect_parameters_buffer.push(indirect_parameters) as u32)
+        .try_into()
+        .ok()
 }
 
 bitflags::bitflags! {
@@ -1666,7 +1962,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
 
         let mut dynamic_offsets: [u32; 3] = Default::default();
         let mut offset_count = 0;
-        if let Some(dynamic_offset) = item.dynamic_offset() {
+        if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() {
             dynamic_offsets[offset_count] = dynamic_offset.get();
             offset_count += 1;
         }
@@ -1689,8 +1985,9 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
     type Param = (
         SRes<RenderAssets<GpuMesh>>,
         SRes<RenderMeshInstances>,
+        SRes<IndirectParametersBuffer>,
         SRes<PipelineCache>,
-        Option<SRes<PreprocessPipeline>>,
+        Option<SRes<PreprocessPipelines>>,
     );
     type ViewQuery = Has<PreprocessBindGroup>;
     type ItemQuery = ();
@@ -1699,25 +1996,15 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
         item: &P,
         has_preprocess_bind_group: ROQueryItem<Self::ViewQuery>,
         _item_query: Option<()>,
-        (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem<
-            'w,
-            '_,
-            Self::Param,
-        >,
+        (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
         // If we're using GPU preprocessing, then we're dependent on that
         // compute shader having been run, which of course can only happen if
         // it's compiled. Otherwise, our mesh instance data won't be present.
-        if let Some(preprocess_pipeline) = preprocess_pipeline {
+        if let Some(preprocess_pipelines) = preprocess_pipelines {
             if !has_preprocess_bind_group
-                || !preprocess_pipeline
-                    .pipeline_id
-                    .is_some_and(|preprocess_pipeline_id| {
-                        pipeline_cache
-                            .get_compute_pipeline(preprocess_pipeline_id)
-                            .is_some()
-                    })
+                || !preprocess_pipelines.pipelines_are_loaded(&pipeline_cache)
             {
                 return RenderCommandResult::Failure;
             }
@@ -1725,6 +2012,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
 
         let meshes = meshes.into_inner();
         let mesh_instances = mesh_instances.into_inner();
+        let indirect_parameters_buffer = indirect_parameters_buffer.into_inner();
 
         let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
             return RenderCommandResult::Failure;
@@ -1733,9 +2021,26 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
             return RenderCommandResult::Failure;
         };
 
+        // Calculate the indirect offset, and look up the buffer.
+        let indirect_parameters = match item.extra_index().as_indirect_parameters_index() {
+            None => None,
+            Some(index) => match indirect_parameters_buffer.buffer() {
+                None => {
+                    warn!("Not rendering mesh because indirect parameters buffer wasn't present");
+                    return RenderCommandResult::Failure;
+                }
+                Some(buffer) => Some((
+                    index as u64 * mem::size_of::<IndirectParameters>() as u64,
+                    buffer,
+                )),
+            },
+        };
+
         pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..));
 
         let batch_range = item.batch_range();
+
+        // Draw either directly or indirectly, as appropriate.
         match &gpu_mesh.buffer_info {
             GpuBufferInfo::Indexed {
                 buffer,
@@ -1743,11 +2048,25 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                 count,
             } => {
                 pass.set_index_buffer(buffer.slice(..), 0, *index_format);
-                pass.draw_indexed(0..*count, 0, batch_range.clone());
-            }
-            GpuBufferInfo::NonIndexed => {
-                pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
+                match indirect_parameters {
+                    None => {
+                        pass.draw_indexed(0..*count, 0, batch_range.clone());
+                    }
+                    Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass
+                        .draw_indexed_indirect(
+                            indirect_parameters_buffer,
+                            indirect_parameters_offset,
+                        ),
+                }
             }
+            GpuBufferInfo::NonIndexed => match indirect_parameters {
+                None => {
+                    pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
+                }
+                Some((indirect_parameters_offset, indirect_parameters_buffer)) => {
+                    pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset);
+                }
+            },
         }
         RenderCommandResult::Success
     }
diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
index c4adaa5105623..614c9091a6e2a 100644
--- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -9,6 +9,7 @@
 
 #import bevy_pbr::mesh_types::Mesh
 #import bevy_render::maths
+#import bevy_render::view::View
 
 // Per-frame data that the CPU supplies to the GPU.
 struct MeshInput {
@@ -23,15 +24,45 @@ struct MeshInput {
     previous_input_index: u32,
 }
 
+// Information about each mesh instance needed to cull it on GPU.
+//
+// At the moment, this just consists of its axis-aligned bounding box (AABB).
+struct MeshCullingData {
+    // The 3D center of the AABB in model space, padded with an extra unused
+    // float value.
+    aabb_center: vec4<f32>,
+    // The 3D extents of the AABB in model space, divided by two, padded with
+    // an extra unused float value.
+    aabb_half_extents: vec4<f32>,
+}
+
 // One invocation of this compute shader: i.e. one mesh instance in a view.
 struct PreprocessWorkItem {
     // The index of the `MeshInput` in the `current_input` buffer that we read
     // from.
     input_index: u32,
-    // The index of the `Mesh` in `output` that we write to.
+    // In direct mode, the index of the `Mesh` in `output` that we write to. In
+    // indirect mode, the index of the `IndirectParameters` in
+    // `indirect_parameters` that we write to.
     output_index: u32,
 }
 
+// The `wgpu` indirect parameters structure. This is a union of two structures.
+// For more information, see the corresponding comment in
+// `gpu_preprocessing.rs`.
+struct IndirectParameters {
+    // `vertex_count` or `index_count`.
+    data0: u32,
+    // `instance_count` in both structures.
+    instance_count: atomic<u32>,
+    // `first_vertex` in both structures.
+    first_vertex: u32,
+    // `first_instance` or `base_vertex`.
+    data1: u32,
+    // A read-only copy of `instance_index`.
+    instance_index: u32,
+}
+
 // The current frame's `MeshInput`.
 @group(0) @binding(0) var<storage> current_input: array<MeshInput>;
 // The `MeshInput` values from the previous frame.
@@ -43,20 +74,82 @@ struct PreprocessWorkItem {
 // The output array of `Mesh`es.
 @group(0) @binding(3) var<storage, read_write> output: array<Mesh>;
 
+#ifdef INDIRECT
+// The array of indirect parameters for drawcalls.
+@group(0) @binding(4) var<storage, read_write> indirect_parameters: array<IndirectParameters>;
+#endif
+
+#ifdef FRUSTUM_CULLING
+// Data needed to cull the meshes.
+//
+// At the moment, this consists only of AABBs.
+@group(0) @binding(5) var<storage> mesh_culling_data: array<MeshCullingData>;
+
+// The view data, including the view matrix.
+@group(0) @binding(6) var<uniform> view: View;
+
+// Returns true if the view frustum intersects an oriented bounding box (OBB).
+//
+// `aabb_center.w` should be 1.0.
+fn view_frustum_intersects_obb(
+    model: mat4x4<f32>,
+    aabb_center: vec4<f32>,
+    aabb_half_extents: vec3<f32>,
+) -> bool {
+
+    for (var i = 0; i < 5; i += 1) {
+        // Calculate relative radius of the sphere associated with this plane.
+        let plane_normal = view.frustum[i];
+        let relative_radius = dot(
+            abs(
+                vec3(
+                    dot(plane_normal, model[0]),
+                    dot(plane_normal, model[1]),
+                    dot(plane_normal, model[2]),
+                )
+            ),
+            aabb_half_extents
+        );
+
+        // Check the frustum plane.
+        if (!maths::sphere_intersects_plane_half_space(
+                plane_normal, aabb_center, relative_radius)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+#endif
+
 @compute
 @workgroup_size(64)
 fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
+    // Figure out our instance index. If this thread doesn't correspond to any
+    // index, bail.
     let instance_index = global_invocation_id.x;
     if (instance_index >= arrayLength(&work_items)) {
         return;
     }
 
     // Unpack.
-    let mesh_index = work_items[instance_index].input_index;
+    let input_index = work_items[instance_index].input_index;
     let output_index = work_items[instance_index].output_index;
-    let model_affine_transpose = current_input[mesh_index].model;
+    let model_affine_transpose = current_input[input_index].model;
     let model = maths::affine3_to_square(model_affine_transpose);
 
+    // Cull if necessary.
+#ifdef FRUSTUM_CULLING
+    let aabb_center = mesh_culling_data[input_index].aabb_center.xyz;
+    let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz;
+
+    // Do an OBB-based frustum cull.
+    let model_center = model * vec4(aabb_center, 1.0);
+    if (!view_frustum_intersects_obb(model, model_center, aabb_half_extents)) {
+        return;
+    }
+#endif
+
     // Calculate inverse transpose.
     let inverse_transpose_model = transpose(maths::inverse_affine3(transpose(
         model_affine_transpose)));
@@ -68,7 +161,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
     let inverse_transpose_model_b = inverse_transpose_model[2].z;
 
     // Look up the previous model matrix.
-    let previous_input_index = current_input[mesh_index].previous_input_index;
+    let previous_input_index = current_input[input_index].previous_input_index;
     var previous_model: mat3x4<f32>;
     if (previous_input_index == 0xffffffff) {
         previous_model = model_affine_transpose;
@@ -76,11 +169,21 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
         previous_model = previous_input[previous_input_index].model;
     }
 
+    // Figure out the output index. In indirect mode, this involves bumping the
+    // instance index in the indirect parameters structure. Otherwise, this
+    // index was directly supplied to us.
+#ifdef INDIRECT
+    let mesh_output_index = indirect_parameters[output_index].instance_index +
+        atomicAdd(&indirect_parameters[output_index].instance_count, 1u);
+#else
+    let mesh_output_index = output_index;
+#endif
+
     // Write the output.
-    output[output_index].model = model_affine_transpose;
-    output[output_index].previous_model = previous_model;
-    output[output_index].inverse_transpose_model_a = inverse_transpose_model_a;
-    output[output_index].inverse_transpose_model_b = inverse_transpose_model_b;
-    output[output_index].flags = current_input[mesh_index].flags;
-    output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect;
+    output[mesh_output_index].model = model_affine_transpose;
+    output[mesh_output_index].previous_model = previous_model;
+    output[mesh_output_index].inverse_transpose_model_a = inverse_transpose_model_a;
+    output[mesh_output_index].inverse_transpose_model_b = inverse_transpose_model_b;
+    output[mesh_output_index].flags = current_input[input_index].flags;
+    output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect;
 }
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 22bebd8bf94e9..661ff7f301ed9 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -1,29 +1,74 @@
 //! Batching functionality when GPU preprocessing is in use.
 
-use std::marker::PhantomData;
-
+use bevy_app::{App, Plugin};
+use bevy_derive::{Deref, DerefMut};
 use bevy_ecs::{
     entity::Entity,
-    query::With,
+    query::{Has, With},
+    schedule::IntoSystemConfigs as _,
     system::{Query, Res, ResMut, Resource, StaticSystemParam},
+    world::{FromWorld, World},
 };
 use bevy_encase_derive::ShaderType;
 use bevy_utils::EntityHashMap;
 use bytemuck::{Pod, Zeroable};
+use nonmax::NonMaxU32;
 use smallvec::smallvec;
-use wgpu::{BindingResource, BufferUsages};
+use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
 
 use crate::{
     render_phase::{
         BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        SortedPhaseItem, SortedRenderPhase,
+        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
     },
-    render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec},
-    renderer::{RenderDevice, RenderQueue},
-    view::ViewTarget,
+    render_resource::{BufferVec, GpuArrayBufferable, UninitBufferVec},
+    renderer::{RenderAdapter, RenderDevice, RenderQueue},
+    view::{GpuCulling, ViewTarget},
+    Render, RenderApp, RenderSet,
 };
 
-use super::GetFullBatchData;
+use super::{BatchMeta, GetBatchData, GetFullBatchData};
+
+pub struct BatchingPlugin;
+
+impl Plugin for BatchingPlugin {
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.add_systems(
+            Render,
+            write_indirect_parameters_buffer.in_set(RenderSet::PrepareResourcesFlush),
+        );
+    }
+
+    fn finish(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.init_resource::<GpuPreprocessingSupport>();
+    }
+}
+
+/// Records whether GPU preprocessing and/or GPU culling are supported on the
+/// device.
+///
+/// No GPU preprocessing is supported on WebGL because of the lack of compute
+/// shader support.  GPU preprocessing is supported on DirectX 12, but due to [a
+/// `wgpu` limitation] GPU culling is not.
+///
+/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
+#[derive(Clone, Copy, PartialEq, Resource)]
+pub enum GpuPreprocessingSupport {
+    /// No GPU preprocessing support is available at all.
+    None,
+    /// GPU preprocessing is available, but GPU culling isn't.
+    PreprocessingOnly,
+    /// Both GPU preprocessing and GPU culling are available.
+    Culling,
+}
 
 /// The GPU buffers holding the data needed to render batches.
 ///
@@ -51,7 +96,7 @@ where
     /// corresponds to each instance.
     ///
     /// This is keyed off each view. Each view has a separate buffer.
-    pub work_item_buffers: EntityHashMap<Entity, BufferVec<PreprocessWorkItem>>,
+    pub work_item_buffers: EntityHashMap<Entity, PreprocessWorkItemBuffer>,
 
     /// The uniform data inputs for the current frame.
     ///
@@ -68,6 +113,14 @@ where
     pub previous_input_buffer: BufferVec<BDI>,
 }
 
+/// The buffer of GPU preprocessing work items for a single view.
+pub struct PreprocessWorkItemBuffer {
+    /// The buffer of work items.
+    pub buffer: BufferVec<PreprocessWorkItem>,
+    /// True if we're using GPU culling.
+    pub gpu_culling: bool,
+}
+
 /// One invocation of the preprocessing shader: i.e. one mesh instance in a
 /// view.
 #[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
@@ -76,10 +129,114 @@ pub struct PreprocessWorkItem {
     /// The index of the batch input data in the input buffer that the shader
     /// reads from.
     pub input_index: u32,
-    /// The index of the `MeshUniform` in the output buffer that we write to.
+    /// In direct mode, this is the index of the `MeshUniform` in the output
+    /// buffer that we write to. In indirect mode, this is the index of the
+    /// [`IndirectParameters`].
     pub output_index: u32,
 }
 
+/// The `wgpu` indirect parameters structure.
+///
+/// This is actually a union of the two following structures:
+///
+/// ```
+/// #[repr(C)]
+/// struct ArrayIndirectParameters {
+///     vertex_count: u32,
+///     instance_count: u32,
+///     first_vertex: u32,
+///     first_instance: u32,
+/// }
+///
+/// #[repr(C)]
+/// struct ElementIndirectParameters {
+///     index_count: u32,
+///     instance_count: u32,
+///     first_vertex: u32,
+///     base_vertex: u32,
+///     first_instance: u32,
+/// }
+/// ```
+///
+/// We actually generally treat these two variants identically in code. To do
+/// that, we make the following two observations:
+///
+/// 1. `instance_count` is in the same place in both structures. So we can
+/// access it regardless of the structure we're looking at.
+///
+/// 2. The second structure is one word larger than the first. Thus we need to
+/// pad out the first structure by one word in order to place both structures in
+/// an array. If we pad out `ArrayIndirectParameters` by copying the
+/// `first_instance` field into the padding, then the resulting union structure
+/// will always have a read-only copy of `first_instance` in the final word. We
+/// take advantage of this in the shader to reduce branching.
+#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
+#[repr(C)]
+pub struct IndirectParameters {
+    /// For `ArrayIndirectParameters`, `vertex_count`; for
+    /// `ElementIndirectParameters`, `index_count`.
+    pub vertex_or_index_count: u32,
+
+    /// The number of instances we're going to draw.
+    ///
+    /// This field is in the same place in both structures.
+    pub instance_count: u32,
+
+    /// The index of the first vertex we're to draw.
+    pub first_vertex: u32,
+
+    /// For `ArrayIndirectParameters`, `first_instance`; for
+    /// `ElementIndirectParameters`, `base_vertex`.`
+    pub base_vertex_or_first_instance: u32,
+
+    /// For `ArrayIndirectParameters`, this is padding; for
+    /// `ElementIndirectParameters`, this is `first_instance`.
+    ///
+    /// Conventionally, we copy `first_instance` into this field when padding
+    /// out `ArrayIndirectParameters`. That way, shader code can read this value
+    /// at the same place, regardless of the specific structure this represents.
+    pub first_instance: u32,
+}
+
+/// The buffer containing the list of [`IndirectParameters`], for draw commands.
+#[derive(Resource, Deref, DerefMut)]
+pub struct IndirectParametersBuffer(pub BufferVec<IndirectParameters>);
+
+impl IndirectParametersBuffer {
+    /// Creates the indirect parameters buffer.
+    pub fn new() -> IndirectParametersBuffer {
+        IndirectParametersBuffer(BufferVec::new(
+            BufferUsages::STORAGE | BufferUsages::INDIRECT,
+        ))
+    }
+}
+
+impl Default for IndirectParametersBuffer {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl FromWorld for GpuPreprocessingSupport {
+    fn from_world(world: &mut World) -> Self {
+        let adapter = world.resource::<RenderAdapter>();
+        let device = world.resource::<RenderDevice>();
+
+        if device.limits().max_compute_workgroup_size_x == 0 {
+            GpuPreprocessingSupport::None
+        } else if !device
+            .features()
+            .contains(Features::INDIRECT_FIRST_INSTANCE) ||
+            !adapter.get_downlevel_capabilities().flags.contains(
+        DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW)
+        {
+            GpuPreprocessingSupport::PreprocessingOnly
+        } else {
+            GpuPreprocessingSupport::Culling
+        }
+    }
+}
+
 impl<BD, BDI> BatchedInstanceBuffers<BD, BDI>
 where
     BD: GpuArrayBufferable + Sync + Send + 'static,
@@ -110,7 +267,7 @@ where
         self.current_input_buffer.clear();
         self.previous_input_buffer.clear();
         for work_item_buffer in self.work_item_buffers.values_mut() {
-            work_item_buffer.clear();
+            work_item_buffer.buffer.clear();
         }
     }
 }
@@ -125,6 +282,51 @@ where
     }
 }
 
+/// Information about a render batch that we're building up during a sorted
+/// render phase.
+struct SortedRenderBatch<F>
+where
+    F: GetBatchData,
+{
+    /// The index of the first phase item in this batch in the list of phase
+    /// items.
+    phase_item_start_index: u32,
+
+    /// The index of the first instance in this batch in the instance buffer.
+    instance_start_index: u32,
+
+    /// The index of the indirect parameters for this batch in the
+    /// [`IndirectParametersBuffer`].
+    ///
+    /// If CPU culling is being used, then this will be `None`.
+    indirect_parameters_index: Option<NonMaxU32>,
+
+    /// Metadata that can be used to determine whether an instance can be placed
+    /// into this batch.
+    meta: BatchMeta<F::CompareData>,
+}
+
+impl<F> SortedRenderBatch<F>
+where
+    F: GetBatchData,
+{
+    /// Finalizes this batch and updates the [`SortedRenderPhase`] with the
+    /// appropriate indices.
+    ///
+    /// `instance_end_index` is the index of the last instance in this batch
+    /// plus one.
+    fn flush<I>(self, instance_end_index: u32, phase: &mut SortedRenderPhase<I>)
+    where
+        I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    {
+        let (batch_range, batch_extra_index) =
+            phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut();
+        *batch_range = self.instance_start_index..instance_end_index;
+        *batch_extra_index =
+            PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index);
+    }
+}
+
 /// A system that runs early in extraction and clears out all the
 /// [`BatchedInstanceBuffers`] for the frame.
 ///
@@ -166,53 +368,126 @@ pub fn delete_old_work_item_buffers<GFBD>(
 /// is in use. This means comparing metadata needed to draw each phase item and
 /// trying to combine the draws into a batch.
 pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
-    gpu_batched_instance_buffers: ResMut<
-        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
-    >,
-    mut views: Query<(Entity, &mut SortedRenderPhase<I>)>,
-    param: StaticSystemParam<GFBD::Param>,
+    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
+    mut views: Query<(Entity, &mut SortedRenderPhase<I>, Has<GpuCulling>)>,
+    system_param_item: StaticSystemParam<GFBD::Param>,
 ) where
     I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
     GFBD: GetFullBatchData,
 {
-    let system_param_item = param.into_inner();
-
     // We only process GPU-built batch data in this function.
     let BatchedInstanceBuffers {
         ref mut data_buffer,
         ref mut work_item_buffers,
         ..
-    } = gpu_batched_instance_buffers.into_inner();
+    } = gpu_array_buffer.into_inner();
 
-    for (view, mut phase) in &mut views {
+    for (view, mut phase, gpu_culling) in &mut views {
         // Create the work item buffer if necessary.
-        let work_item_buffer = work_item_buffers
-            .entry(view)
-            .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE));
+        let work_item_buffer =
+            work_item_buffers
+                .entry(view)
+                .or_insert_with(|| PreprocessWorkItemBuffer {
+                    buffer: BufferVec::new(BufferUsages::STORAGE),
+                    gpu_culling,
+                });
 
-        super::batch_and_prepare_sorted_render_phase::<I, GFBD>(&mut phase, |item| {
-            let (input_index, compare_data) =
-                GFBD::get_index_and_compare_data(&system_param_item, item.entity())?;
-            let output_index = data_buffer.add() as u32;
+        // Walk through the list of phase items, building up batches as we go.
+        let mut batch: Option<SortedRenderBatch<GFBD>> = None;
+        for current_index in 0..phase.items.len() {
+            // Get the index of the input data, and comparison metadata, for
+            // this entity.
+            let current_batch_input_index = GFBD::get_index_and_compare_data(
+                &system_param_item,
+                phase.items[current_index].entity(),
+            );
+
+            // Unpack that index and metadata. Note that it's possible for index
+            // and/or metadata to not be present, which signifies that this
+            // entity is unbatchable. In that case, we break the batch here and
+            // otherwise ignore the phase item.
+            let (current_input_index, current_meta);
+            match current_batch_input_index {
+                Some((input_index, Some(current_compare_data))) => {
+                    current_input_index = Some(input_index);
+                    current_meta = Some(BatchMeta::new(
+                        &phase.items[current_index],
+                        current_compare_data,
+                    ));
+                }
+                _ => {
+                    current_input_index = None;
+                    current_meta = None;
+                }
+            }
 
-            work_item_buffer.push(PreprocessWorkItem {
-                input_index: input_index.into(),
-                output_index,
+            // Determine if this entity can be included in the batch we're
+            // building up.
+            let can_batch = batch.as_ref().is_some_and(|batch| {
+                current_meta
+                    .as_ref()
+                    .is_some_and(|current_meta| batch.meta == *current_meta)
             });
 
-            *item.batch_range_mut() = output_index..output_index + 1;
+            // Make space in the data buffer for this instance.
+            let current_entity = phase.items[current_index].entity();
+            let output_index = data_buffer.add() as u32;
+
+            // If we can't batch, break the existing batch and make a new one.
+            if !can_batch {
+                // Break a batch if we need to.
+                if let Some(batch) = batch.take() {
+                    batch.flush(output_index, &mut phase);
+                }
+
+                // Start a new batch.
+                batch = current_meta.map(|meta| {
+                    let indirect_parameters_index = if gpu_culling {
+                        GFBD::get_batch_indirect_parameters_index(
+                            &system_param_item,
+                            &mut indirect_parameters_buffer,
+                            current_entity,
+                            output_index,
+                        )
+                    } else {
+                        None
+                    };
+                    SortedRenderBatch {
+                        phase_item_start_index: current_index as u32,
+                        instance_start_index: output_index,
+                        indirect_parameters_index,
+                        meta,
+                    }
+                });
+            }
 
-            compare_data
-        });
+            // Add a new preprocessing work item so that the preprocessing
+            // shader will copy the per-instance data over.
+            if let (Some(batch), Some(input_index)) = (batch.as_ref(), current_input_index.as_ref())
+            {
+                work_item_buffer.buffer.push(PreprocessWorkItem {
+                    input_index: (*input_index).into(),
+                    output_index: match batch.indirect_parameters_index {
+                        Some(indirect_parameters_index) => indirect_parameters_index.into(),
+                        None => output_index,
+                    },
+                });
+            }
+        }
+
+        // Flush the final batch if necessary.
+        if let Some(batch) = batch.take() {
+            batch.flush(data_buffer.len() as u32, &mut phase);
+        }
     }
 }
 
 /// Creates batches for a render phase that uses bins.
 pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
-    gpu_batched_instance_buffers: ResMut<
-        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
-    >,
-    mut views: Query<(Entity, &mut BinnedRenderPhase<BPI>)>,
+    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
+    mut views: Query<(Entity, &mut BinnedRenderPhase<BPI>, Has<GpuCulling>)>,
     param: StaticSystemParam<GFBD::Param>,
 ) where
     BPI: BinnedPhaseItem,
@@ -224,16 +499,20 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
         ref mut data_buffer,
         ref mut work_item_buffers,
         ..
-    } = gpu_batched_instance_buffers.into_inner();
+    } = gpu_array_buffer.into_inner();
 
-    for (view, mut phase) in &mut views {
+    for (view, mut phase, gpu_culling) in &mut views {
         let phase = &mut *phase; // Borrow checker.
 
         // Create the work item buffer if necessary; otherwise, just mark it as
         // used this frame.
-        let work_item_buffer = work_item_buffers
-            .entry(view)
-            .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE));
+        let work_item_buffer =
+            work_item_buffers
+                .entry(view)
+                .or_insert_with(|| PreprocessWorkItemBuffer {
+                    buffer: BufferVec::new(BufferUsages::STORAGE),
+                    gpu_culling,
+                });
 
         // Prepare batchables.
 
@@ -245,19 +524,50 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 };
                 let output_index = data_buffer.add() as u32;
 
-                work_item_buffer.push(PreprocessWorkItem {
-                    input_index: input_index.into(),
-                    output_index,
-                });
-
-                batch
-                    .get_or_insert(BinnedRenderPhaseBatch {
-                        representative_entity: entity,
-                        instance_range: output_index..output_index,
-                        dynamic_offset: None,
-                    })
-                    .instance_range
-                    .end = output_index + 1;
+                match batch {
+                    Some(ref mut batch) => {
+                        batch.instance_range.end = output_index + 1;
+                        work_item_buffer.buffer.push(PreprocessWorkItem {
+                            input_index: input_index.into(),
+                            output_index: batch
+                                .extra_index
+                                .as_indirect_parameters_index()
+                                .unwrap_or(output_index),
+                        });
+                    }
+
+                    None if gpu_culling => {
+                        let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
+                            &system_param_item,
+                            &mut indirect_parameters_buffer,
+                            entity,
+                            output_index,
+                        );
+                        work_item_buffer.buffer.push(PreprocessWorkItem {
+                            input_index: input_index.into(),
+                            output_index: indirect_parameters_index.unwrap_or_default().into(),
+                        });
+                        batch = Some(BinnedRenderPhaseBatch {
+                            representative_entity: entity,
+                            instance_range: output_index..output_index + 1,
+                            extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
+                                indirect_parameters_index,
+                            ),
+                        });
+                    }
+
+                    None => {
+                        work_item_buffer.buffer.push(PreprocessWorkItem {
+                            input_index: input_index.into(),
+                            output_index,
+                        });
+                        batch = Some(BinnedRenderPhaseBatch {
+                            representative_entity: entity,
+                            instance_range: output_index..output_index + 1,
+                            extra_index: PhaseItemExtraIndex::NONE,
+                        });
+                    }
+                }
             }
 
             if let Some(batch) = batch {
@@ -274,18 +584,38 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 };
                 let output_index = data_buffer.add() as u32;
 
-                work_item_buffer.push(PreprocessWorkItem {
-                    input_index: input_index.into(),
-                    output_index,
-                });
-
-                unbatchables
-                    .buffer_indices
-                    .add(GpuArrayBufferIndex::<GFBD::BufferData> {
-                        index: output_index,
-                        dynamic_offset: None,
-                        element_type: PhantomData,
+                if gpu_culling {
+                    let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
+                        &system_param_item,
+                        &mut indirect_parameters_buffer,
+                        entity,
+                        output_index,
+                    )
+                    .unwrap_or_default();
+                    work_item_buffer.buffer.push(PreprocessWorkItem {
+                        input_index: input_index.into(),
+                        output_index: indirect_parameters_index.into(),
+                    });
+                    unbatchables
+                        .buffer_indices
+                        .add(UnbatchableBinnedEntityIndices {
+                            instance_index: indirect_parameters_index.into(),
+                            extra_index: PhaseItemExtraIndex::indirect_parameters_index(
+                                indirect_parameters_index.into(),
+                            ),
+                        });
+                } else {
+                    work_item_buffer.buffer.push(PreprocessWorkItem {
+                        input_index: input_index.into(),
+                        output_index,
                     });
+                    unbatchables
+                        .buffer_indices
+                        .add(UnbatchableBinnedEntityIndices {
+                            instance_index: output_index,
+                            extra_index: PhaseItemExtraIndex::NONE,
+                        });
+                }
             }
         }
     }
@@ -295,29 +625,34 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 pub fn write_batched_instance_buffers<GFBD>(
     render_device: Res<RenderDevice>,
     render_queue: Res<RenderQueue>,
-    mut gpu_batched_instance_buffers: ResMut<
-        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
-    >,
+    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
 ) where
     GFBD: GetFullBatchData,
 {
-    gpu_batched_instance_buffers
-        .data_buffer
-        .write_buffer(&render_device);
-    gpu_batched_instance_buffers
-        .current_input_buffer
-        .write_buffer(&render_device, &render_queue);
+    let BatchedInstanceBuffers {
+        ref mut data_buffer,
+        work_item_buffers: ref mut index_buffers,
+        ref mut current_input_buffer,
+        previous_input_buffer: _,
+    } = gpu_array_buffer.into_inner();
+
+    data_buffer.write_buffer(&render_device);
+    current_input_buffer.write_buffer(&render_device, &render_queue);
     // There's no need to write `previous_input_buffer`, as we wrote
     // that on the previous frame, and it hasn't changed.
 
-    for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() {
-        work_item_buffer.write_buffer(&render_device, &render_queue);
+    for index_buffer in index_buffers.values_mut() {
+        index_buffer
+            .buffer
+            .write_buffer(&render_device, &render_queue);
     }
 }
 
-/// Determines whether it's possible to run preprocessing on the GPU.
-///
-/// Currently, this simply checks to see whether compute shaders are supported.
-pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool {
-    render_device.limits().max_compute_workgroup_size_x > 0
+pub fn write_indirect_parameters_buffer(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
+) {
+    indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
+    indirect_parameters_buffer.clear();
 }
diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs
index 3ce9aaf38bb10..0e1a6ada7ceb6 100644
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@@ -14,6 +14,8 @@ use crate::{
     render_resource::{CachedRenderPipelineId, GpuArrayBufferable},
 };
 
+use self::gpu_preprocessing::IndirectParametersBuffer;
+
 pub mod gpu_preprocessing;
 pub mod no_gpu_preprocessing;
 
@@ -52,7 +54,7 @@ impl<T: PartialEq> BatchMeta<T> {
         BatchMeta {
             pipeline_id: item.cached_pipeline(),
             draw_function_id: item.draw_function(),
-            dynamic_offset: item.dynamic_offset(),
+            dynamic_offset: item.extra_index().as_dynamic_offset(),
             user_data,
         }
     }
@@ -133,6 +135,19 @@ pub trait GetFullBatchData: GetBatchData {
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
     ) -> Option<NonMaxU32>;
+
+    /// Pushes [`gpu_preprocessing::IndirectParameters`] necessary to draw this
+    /// batch onto the given [`IndirectParametersBuffer`], and returns its
+    /// index.
+    ///
+    /// This is only used if GPU culling is enabled (which requires GPU
+    /// preprocessing).
+    fn get_batch_indirect_parameters_index(
+        param: &SystemParamItem<Self::Param>,
+        indirect_parameters_buffer: &mut IndirectParametersBuffer,
+        entity: Entity,
+        instance_index: u32,
+    ) -> Option<NonMaxU32>;
 }
 
 /// Sorts a render phase that uses bins.
diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
index 3387243d13dbd..15dfa7842a009 100644
--- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@@ -8,7 +8,7 @@ use wgpu::BindingResource;
 use crate::{
     render_phase::{
         BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        SortedPhaseItem, SortedRenderPhase,
+        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
     },
     render_resource::{GpuArrayBuffer, GpuArrayBufferable},
     renderer::{RenderDevice, RenderQueue},
@@ -79,8 +79,9 @@ pub fn batch_and_prepare_sorted_render_phase<I, GBD>(
             let buffer_index = batched_instance_buffer.push(buffer_data);
 
             let index = buffer_index.index;
-            *item.batch_range_mut() = index..index + 1;
-            *item.dynamic_offset_mut() = buffer_index.dynamic_offset;
+            let (batch_range, extra_index) = item.batch_range_and_extra_index_mut();
+            *batch_range = index..index + 1;
+            *extra_index = PhaseItemExtraIndex::maybe_dynamic_offset(buffer_index.dynamic_offset);
 
             compare_data
         });
@@ -90,13 +91,14 @@ pub fn batch_and_prepare_sorted_render_phase<I, GBD>(
 /// Creates batches for a render phase that uses bins, when GPU batch data
 /// building isn't in use.
 pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
-    mut buffer: ResMut<BatchedInstanceBuffer<GFBD::BufferData>>,
+    gpu_array_buffer: ResMut<BatchedInstanceBuffer<GFBD::BufferData>>,
     mut views: Query<&mut BinnedRenderPhase<BPI>>,
     param: StaticSystemParam<GFBD::Param>,
 ) where
     BPI: BinnedPhaseItem,
     GFBD: GetFullBatchData,
 {
+    let gpu_array_buffer = gpu_array_buffer.into_inner();
     let system_param_item = param.into_inner();
 
     for mut phase in &mut views {
@@ -111,7 +113,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 else {
                     continue;
                 };
-                let instance = buffer.push(buffer_data);
+                let instance = gpu_array_buffer.push(buffer_data);
 
                 // If the dynamic offset has changed, flush the batch.
                 //
@@ -120,12 +122,15 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 // with no storage buffers.
                 if !batch_set.last().is_some_and(|batch| {
                     batch.instance_range.end == instance.index
-                        && batch.dynamic_offset == instance.dynamic_offset
+                        && batch.extra_index
+                            == PhaseItemExtraIndex::maybe_dynamic_offset(instance.dynamic_offset)
                 }) {
                     batch_set.push(BinnedRenderPhaseBatch {
                         representative_entity: entity,
                         instance_range: instance.index..instance.index,
-                        dynamic_offset: instance.dynamic_offset,
+                        extra_index: PhaseItemExtraIndex::maybe_dynamic_offset(
+                            instance.dynamic_offset,
+                        ),
                     });
                 }
 
@@ -145,8 +150,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                 else {
                     continue;
                 };
-                let instance = buffer.push(buffer_data);
-                unbatchables.buffer_indices.add(instance);
+                let instance = gpu_array_buffer.push(buffer_data);
+                unbatchables.buffer_indices.add(instance.into());
             }
         }
     }
diff --git a/crates/bevy_render/src/camera/camera.rs b/crates/bevy_render/src/camera/camera.rs
index 215aa2bb3911c..4d0f334f01d49 100644
--- a/crates/bevy_render/src/camera/camera.rs
+++ b/crates/bevy_render/src/camera/camera.rs
@@ -1,4 +1,5 @@
 use crate::{
+    batching::gpu_preprocessing::GpuPreprocessingSupport,
     camera::{CameraProjection, ManualTextureViewHandle, ManualTextureViews},
     prelude::Image,
     primitives::Frustum,
@@ -6,7 +7,9 @@ use crate::{
     render_graph::{InternedRenderSubGraph, RenderSubGraph},
     render_resource::TextureView,
     texture::GpuImage,
-    view::{ColorGrading, ExtractedView, ExtractedWindows, RenderLayers, VisibleEntities},
+    view::{
+        ColorGrading, ExtractedView, ExtractedWindows, GpuCulling, RenderLayers, VisibleEntities,
+    },
     Extract,
 };
 use bevy_asset::{AssetEvent, AssetId, Assets, Handle};
@@ -17,6 +20,7 @@ use bevy_ecs::{
     entity::Entity,
     event::EventReader,
     prelude::With,
+    query::Has,
     reflect::ReflectComponent,
     system::{Commands, Query, Res, ResMut, Resource},
 };
@@ -24,7 +28,7 @@ use bevy_math::{vec2, Dir3, Mat4, Ray3d, Rect, URect, UVec2, UVec4, Vec2, Vec3};
 use bevy_reflect::prelude::*;
 use bevy_render_macros::ExtractComponent;
 use bevy_transform::components::GlobalTransform;
-use bevy_utils::tracing::warn;
+use bevy_utils::{tracing::warn, warn_once};
 use bevy_utils::{HashMap, HashSet};
 use bevy_window::{
     NormalizedWindowRef, PrimaryWindow, Window, WindowCreated, WindowRef, WindowResized,
@@ -827,9 +831,11 @@ pub fn extract_cameras(
             Option<&TemporalJitter>,
             Option<&RenderLayers>,
             Option<&Projection>,
+            Has<GpuCulling>,
         )>,
     >,
     primary_window: Extract<Query<Entity, With<PrimaryWindow>>>,
+    gpu_preprocessing_support: Res<GpuPreprocessingSupport>,
 ) {
     let primary_window = primary_window.iter().next();
     for (
@@ -844,6 +850,7 @@ pub fn extract_cameras(
         temporal_jitter,
         render_layers,
         projection,
+        gpu_culling,
     ) in query.iter()
     {
         let color_grading = *color_grading.unwrap_or(&ColorGrading::default());
@@ -915,6 +922,16 @@ pub fn extract_cameras(
             if let Some(perspective) = projection {
                 commands.insert(perspective.clone());
             }
+
+            if gpu_culling {
+                if *gpu_preprocessing_support == GpuPreprocessingSupport::Culling {
+                    commands.insert(GpuCulling);
+                } else {
+                    warn_once!(
+                        "GPU culling isn't supported on this platform; ignoring `GpuCulling`."
+                    );
+                }
+            }
         }
     }
 }
diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs
index 1ba6d9cbe371e..60d8ef648aa77 100644
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@@ -52,6 +52,7 @@ pub mod prelude {
     };
 }
 
+use batching::gpu_preprocessing::BatchingPlugin;
 use bevy_ecs::schedule::ScheduleBuildSettings;
 use bevy_utils::prelude::default;
 pub use extract_param::Extract;
@@ -334,6 +335,7 @@ impl Plugin for RenderPlugin {
             MeshPlugin,
             GlobalsPlugin,
             MorphPlugin,
+            BatchingPlugin,
         ));
 
         app.init_resource::<RenderAssetBytesPerFrame>()
diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl
index 4070a8679a5b8..720e6bac46a08 100644
--- a/crates/bevy_render/src/maths.wgsl
+++ b/crates/bevy_render/src/maths.wgsl
@@ -52,6 +52,11 @@ fn inverse_affine3(affine: mat4x3<f32>) -> mat4x3<f32> {
     return mat4x3<f32>(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3]));
 }
 
+// Extracts the upper 3x3 portion of a 4x4 matrix.
+fn mat4x4_to_mat3x3(m: mat4x4<f32>) -> mat3x3<f32> {
+    return mat3x3<f32>(m[0].xyz, m[1].xyz, m[2].xyz);
+}
+
 // Creates an orthonormal basis given a Z vector and an up vector (which becomes
 // Y after orthonormalization).
 //
@@ -64,3 +69,16 @@ fn orthonormalize(z_unnormalized: vec3<f32>, up: vec3<f32>) -> mat3x3<f32> {
     let y_basis = cross(z_basis, x_basis);
     return mat3x3(x_basis, y_basis, z_basis);
 }
+
+// Returns true if any part of a sphere is on the positive side of a plane.
+//
+// `sphere_center.w` should be 1.0.
+//
+// This is used for frustum culling.
+fn sphere_intersects_plane_half_space(
+    plane: vec4<f32>,
+    sphere_center: vec4<f32>,
+    sphere_radius: f32
+) -> bool {
+    return dot(plane, sphere_center) + sphere_radius > 0.0;
+}
diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs
index 58e1fa550f4e2..a1125fa938e2c 100644
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@@ -51,7 +51,14 @@ use bevy_ecs::{
     system::{lifetimeless::SRes, SystemParamItem},
 };
 use smallvec::SmallVec;
-use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex};
+use std::{
+    fmt::{self, Debug, Formatter},
+    hash::Hash,
+    iter,
+    marker::PhantomData,
+    ops::Range,
+    slice::SliceIndex,
+};
 
 /// A collection of all rendering instructions, that will be executed by the GPU, for a
 /// single render phase for a single view.
@@ -124,7 +131,7 @@ pub struct BinnedRenderPhaseBatch {
     ///
     /// Note that dynamic offsets are only used on platforms that don't support
     /// storage buffers.
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 /// Information about the unbatchable entities in a bin.
@@ -133,7 +140,7 @@ pub(crate) struct UnbatchableBinnedEntities {
     pub(crate) entities: Vec<Entity>,
 
     /// The GPU array buffer indices of each unbatchable binned entity.
-    pub(crate) buffer_indices: UnbatchableBinnedEntityBufferIndex,
+    pub(crate) buffer_indices: UnbatchableBinnedEntityIndexSet,
 }
 
 /// Stores instance indices and dynamic offsets for unbatchable entities in a
@@ -145,7 +152,7 @@ pub(crate) struct UnbatchableBinnedEntities {
 /// platforms that aren't WebGL 2.
 #[derive(Default)]
 
-pub(crate) enum UnbatchableBinnedEntityBufferIndex {
+pub(crate) enum UnbatchableBinnedEntityIndexSet {
     /// There are no unbatchable entities in this bin (yet).
     #[default]
     NoEntities,
@@ -155,26 +162,42 @@ pub(crate) enum UnbatchableBinnedEntityBufferIndex {
     ///
     /// This is the typical case on platforms other than WebGL 2. We special
     /// case this to avoid allocation on those platforms.
-    NoDynamicOffsets {
+    Sparse {
         /// The range of indices.
         instance_range: Range<u32>,
+        /// The index of the first indirect instance parameters.
+        ///
+        /// The other indices immediately follow these.
+        first_indirect_parameters_index: Option<NonMaxU32>,
     },
 
     /// Dynamic uniforms are present for unbatchable entities in this bin.
     ///
     /// We fall back to this on WebGL 2.
-    DynamicOffsets(Vec<UnbatchableBinnedEntityDynamicOffset>),
+    Dense(Vec<UnbatchableBinnedEntityIndices>),
 }
 
 /// The instance index and dynamic offset (if present) for an unbatchable entity.
 ///
 /// This is only useful on platforms that don't support storage buffers.
 #[derive(Clone, Copy)]
-pub(crate) struct UnbatchableBinnedEntityDynamicOffset {
+pub(crate) struct UnbatchableBinnedEntityIndices {
     /// The instance index.
-    instance_index: u32,
-    /// The dynamic offset, if present.
-    dynamic_offset: Option<NonMaxU32>,
+    pub(crate) instance_index: u32,
+    /// The [`PhaseItemExtraIndex`], if present.
+    pub(crate) extra_index: PhaseItemExtraIndex,
+}
+
+impl<T> From<GpuArrayBufferIndex<T>> for UnbatchableBinnedEntityIndices
+where
+    T: Clone + ShaderSize + WriteInto,
+{
+    fn from(value: GpuArrayBufferIndex<T>) -> Self {
+        UnbatchableBinnedEntityIndices {
+            instance_index: value.index,
+            extra_index: PhaseItemExtraIndex::maybe_dynamic_offset(value.dynamic_offset),
+        }
+    }
 }
 
 impl<BPI> BinnedRenderPhase<BPI>
@@ -227,7 +250,7 @@ where
                     key.clone(),
                     batch.representative_entity,
                     batch.instance_range.clone(),
-                    batch.dynamic_offset,
+                    batch.extra_index,
                 );
 
                 // Fetch the draw function.
@@ -246,17 +269,26 @@ where
             let unbatchable_entities = &self.unbatchable_values[key];
             for (entity_index, &entity) in unbatchable_entities.entities.iter().enumerate() {
                 let unbatchable_dynamic_offset = match &unbatchable_entities.buffer_indices {
-                    UnbatchableBinnedEntityBufferIndex::NoEntities => {
+                    UnbatchableBinnedEntityIndexSet::NoEntities => {
                         // Shouldn't happen…
                         continue;
                     }
-                    UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range } => {
-                        UnbatchableBinnedEntityDynamicOffset {
-                            instance_index: instance_range.start + entity_index as u32,
-                            dynamic_offset: None,
-                        }
-                    }
-                    UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref dynamic_offsets) => {
+                    UnbatchableBinnedEntityIndexSet::Sparse {
+                        instance_range,
+                        first_indirect_parameters_index,
+                    } => UnbatchableBinnedEntityIndices {
+                        instance_index: instance_range.start + entity_index as u32,
+                        extra_index: match first_indirect_parameters_index {
+                            None => PhaseItemExtraIndex::NONE,
+                            Some(first_indirect_parameters_index) => {
+                                PhaseItemExtraIndex::indirect_parameters_index(
+                                    u32::from(*first_indirect_parameters_index)
+                                        + entity_index as u32,
+                                )
+                            }
+                        },
+                    },
+                    UnbatchableBinnedEntityIndexSet::Dense(ref dynamic_offsets) => {
                         dynamic_offsets[entity_index]
                     }
                 };
@@ -266,7 +298,7 @@ where
                     entity,
                     unbatchable_dynamic_offset.instance_index
                         ..(unbatchable_dynamic_offset.instance_index + 1),
-                    unbatchable_dynamic_offset.dynamic_offset,
+                    unbatchable_dynamic_offset.extra_index,
                 );
 
                 // Fetch the draw function.
@@ -300,6 +332,42 @@ where
     }
 }
 
+impl UnbatchableBinnedEntityIndexSet {
+    /// Returns the [`UnbatchableBinnedEntityIndices`] for the given entity.
+    fn indices_for_entity_index(
+        &self,
+        entity_index: u32,
+    ) -> Option<UnbatchableBinnedEntityIndices> {
+        match self {
+            UnbatchableBinnedEntityIndexSet::NoEntities => None,
+            UnbatchableBinnedEntityIndexSet::Sparse { instance_range, .. }
+                if entity_index >= instance_range.len() as u32 =>
+            {
+                None
+            }
+            UnbatchableBinnedEntityIndexSet::Sparse {
+                instance_range,
+                first_indirect_parameters_index: None,
+            } => Some(UnbatchableBinnedEntityIndices {
+                instance_index: instance_range.start + entity_index,
+                extra_index: PhaseItemExtraIndex::NONE,
+            }),
+            UnbatchableBinnedEntityIndexSet::Sparse {
+                instance_range,
+                first_indirect_parameters_index: Some(first_indirect_parameters_index),
+            } => Some(UnbatchableBinnedEntityIndices {
+                instance_index: instance_range.start + entity_index,
+                extra_index: PhaseItemExtraIndex::indirect_parameters_index(
+                    u32::from(*first_indirect_parameters_index) + entity_index,
+                ),
+            }),
+            UnbatchableBinnedEntityIndexSet::Dense(ref indices) => {
+                indices.get(entity_index as usize).copied()
+            }
+        }
+    }
+}
+
 /// A convenient abstraction for adding all the systems necessary for a binned
 /// render phase to the render app.
 ///
@@ -395,74 +463,62 @@ where
     }
 }
 
-impl UnbatchableBinnedEntityBufferIndex {
+impl UnbatchableBinnedEntityIndexSet {
     /// Adds a new entity to the list of unbatchable binned entities.
-    pub fn add<T>(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex<T>)
-    where
-        T: ShaderSize + WriteInto + Clone,
-    {
-        match (&mut *self, gpu_array_buffer_index.dynamic_offset) {
-            (UnbatchableBinnedEntityBufferIndex::NoEntities, None) => {
-                // This is the first entity we've seen, and we're not on WebGL
-                // 2. Initialize the fast path.
-                *self = UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets {
-                    instance_range: gpu_array_buffer_index.index
-                        ..(gpu_array_buffer_index.index + 1),
+    pub fn add(&mut self, indices: UnbatchableBinnedEntityIndices) {
+        match self {
+            UnbatchableBinnedEntityIndexSet::NoEntities => {
+                if indices.extra_index.is_dynamic_offset() {
+                    // This is the first entity we've seen, and we don't have
+                    // compute shaders. Initialize an array.
+                    *self = UnbatchableBinnedEntityIndexSet::Dense(vec![indices]);
+                } else {
+                    // This is the first entity we've seen, and we have compute
+                    // shaders. Initialize the fast path.
+                    *self = UnbatchableBinnedEntityIndexSet::Sparse {
+                        instance_range: indices.instance_index..indices.instance_index + 1,
+                        first_indirect_parameters_index: indices
+                            .extra_index
+                            .as_indirect_parameters_index()
+                            .and_then(|index| NonMaxU32::try_from(index).ok()),
+                    }
                 }
             }
 
-            (UnbatchableBinnedEntityBufferIndex::NoEntities, Some(dynamic_offset)) => {
-                // This is the first entity we've seen, and we're on WebGL 2.
-                // Initialize an array.
-                *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(vec![
-                    UnbatchableBinnedEntityDynamicOffset {
-                        instance_index: gpu_array_buffer_index.index,
-                        dynamic_offset: Some(dynamic_offset),
-                    },
-                ]);
-            }
-
-            (
-                UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets {
-                    ref mut instance_range,
-                },
-                None,
-            ) if instance_range.end == gpu_array_buffer_index.index => {
+            UnbatchableBinnedEntityIndexSet::Sparse {
+                ref mut instance_range,
+                first_indirect_parameters_index,
+            } if instance_range.end == indices.instance_index
+                && ((first_indirect_parameters_index.is_none()
+                    && indices.extra_index == PhaseItemExtraIndex::NONE)
+                    || first_indirect_parameters_index.is_some_and(
+                        |first_indirect_parameters_index| {
+                            Some(
+                                u32::from(first_indirect_parameters_index) + instance_range.end
+                                    - instance_range.start,
+                            ) == indices.extra_index.as_indirect_parameters_index()
+                        },
+                    )) =>
+            {
                 // This is the normal case on non-WebGL 2.
                 instance_range.end += 1;
             }
 
-            (
-                UnbatchableBinnedEntityBufferIndex::DynamicOffsets(ref mut offsets),
-                dynamic_offset,
-            ) => {
-                // This is the normal case on WebGL 2.
-                offsets.push(UnbatchableBinnedEntityDynamicOffset {
-                    instance_index: gpu_array_buffer_index.index,
-                    dynamic_offset,
-                });
-            }
-
-            (
-                UnbatchableBinnedEntityBufferIndex::NoDynamicOffsets { instance_range },
-                dynamic_offset,
-            ) => {
+            UnbatchableBinnedEntityIndexSet::Sparse { instance_range, .. } => {
                 // We thought we were in non-WebGL 2 mode, but we got a dynamic
                 // offset or non-contiguous index anyway. This shouldn't happen,
                 // but let's go ahead and do the sensible thing anyhow: demote
                 // the compressed `NoDynamicOffsets` field to the full
                 // `DynamicOffsets` array.
-                let mut new_dynamic_offsets: Vec<_> = instance_range
-                    .map(|instance_index| UnbatchableBinnedEntityDynamicOffset {
-                        instance_index,
-                        dynamic_offset: None,
-                    })
+                let new_dynamic_offsets = (0..instance_range.len() as u32)
+                    .flat_map(|entity_index| self.indices_for_entity_index(entity_index))
+                    .chain(iter::once(indices))
                     .collect();
-                new_dynamic_offsets.push(UnbatchableBinnedEntityDynamicOffset {
-                    instance_index: gpu_array_buffer_index.index,
-                    dynamic_offset,
-                });
-                *self = UnbatchableBinnedEntityBufferIndex::DynamicOffsets(new_dynamic_offsets);
+                *self = UnbatchableBinnedEntityIndexSet::Dense(new_dynamic_offsets);
+            }
+
+            UnbatchableBinnedEntityIndexSet::Dense(ref mut dense_indices) => {
+                dense_indices.push(indices);
             }
         }
     }
@@ -487,6 +543,7 @@ pub struct SortedRenderPhase<I>
 where
     I: SortedPhaseItem,
 {
+    /// The items within this [`SortedRenderPhase`].
     pub items: Vec<I>,
 }
 
@@ -604,8 +661,144 @@ pub trait PhaseItem: Sized + Send + Sync + 'static {
     fn batch_range(&self) -> &Range<u32>;
     fn batch_range_mut(&mut self) -> &mut Range<u32>;
 
-    fn dynamic_offset(&self) -> Option<NonMaxU32>;
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32>;
+    /// Returns the [`PhaseItemExtraIndex`].
+    ///
+    /// If present, this is either a dynamic offset or an indirect parameters
+    /// index.
+    fn extra_index(&self) -> PhaseItemExtraIndex;
+
+    /// Returns a pair of mutable references to both the batch range and extra
+    /// index.
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex);
+}
+
+/// The "extra index" associated with some [`PhaseItem`]s, alongside the
+/// indirect instance index.
+///
+/// Sometimes phase items require another index in addition to the range of
+/// instances they already have. These can be:
+///
+/// * The *dynamic offset*: a `wgpu` dynamic offset into the uniform buffer of
+/// instance data. This is used on platforms that don't support storage
+/// buffers, to work around uniform buffer size limitations.
+///
+/// * The *indirect parameters index*: an index into the buffer that specifies
+/// the indirect parameters for this [`PhaseItem`]'s drawcall. This is used when
+/// indirect mode is on (as used for GPU culling).
+///
+/// Note that our indirect draw functionality requires storage buffers, so it's
+/// impossible to have both a dynamic offset and an indirect parameters index.
+/// This convenient fact allows us to pack both indices into a single `u32`.
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+pub struct PhaseItemExtraIndex(pub u32);
+
+impl Debug for PhaseItemExtraIndex {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        if self.is_dynamic_offset() {
+            write!(f, "DynamicOffset({})", self.offset())
+        } else if self.is_indirect_parameters_index() {
+            write!(f, "IndirectParametersIndex({})", self.offset())
+        } else {
+            write!(f, "None")
+        }
+    }
+}
+
+impl PhaseItemExtraIndex {
+    /// The flag that indicates that this index is an indirect parameter. If not
+    /// set, this is a dynamic offset.
+    pub const INDIRECT_PARAMETER_INDEX: u32 = 1 << 31;
+    /// To extract the index from a packed [`PhaseItemExtraIndex`], bitwise-and
+    /// the contents with this value.
+    pub const OFFSET_MASK: u32 = Self::INDIRECT_PARAMETER_INDEX - 1;
+    /// To extract the flag from a packed [`PhaseItemExtraIndex`], bitwise-and
+    /// the contents with this value.
+    pub const FLAGS_MASK: u32 = !Self::OFFSET_MASK;
+
+    /// The special value that indicates that no extra index is present.
+    pub const NONE: PhaseItemExtraIndex = PhaseItemExtraIndex(u32::MAX);
+
+    /// Returns either the indirect parameters index or the dynamic offset,
+    /// depending on which is in use.
+    #[inline]
+    fn offset(&self) -> u32 {
+        self.0 & Self::OFFSET_MASK
+    }
+
+    /// Determines whether this extra index is a dynamic offset.
+    #[inline]
+    fn is_dynamic_offset(&self) -> bool {
+        *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) == 0
+    }
+
+    /// Determines whether this extra index is an indirect parameters index.
+    #[inline]
+    fn is_indirect_parameters_index(&self) -> bool {
+        *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) != 0
+    }
+
+    /// Packs a indirect parameters index into this extra index.
+    #[inline]
+    pub fn indirect_parameters_index(indirect_parameter_index: u32) -> PhaseItemExtraIndex {
+        // Make sure we didn't overflow.
+        debug_assert_eq!(indirect_parameter_index & Self::FLAGS_MASK, 0);
+        PhaseItemExtraIndex(indirect_parameter_index | Self::INDIRECT_PARAMETER_INDEX)
+    }
+
+    /// Returns either an indirect parameters index or
+    /// [`PhaseItemExtraIndex::NONE`], as appropriate.
+    #[inline]
+    pub fn maybe_indirect_parameters_index(
+        maybe_indirect_parameters_index: Option<NonMaxU32>,
+    ) -> PhaseItemExtraIndex {
+        match maybe_indirect_parameters_index {
+            Some(indirect_parameters_index) => {
+                Self::indirect_parameters_index(indirect_parameters_index.into())
+            }
+            None => PhaseItemExtraIndex::NONE,
+        }
+    }
+
+    /// Packs a dynamic offset into this extra index.
+    #[inline]
+    pub fn dynamic_offset(dynamic_offset: u32) -> PhaseItemExtraIndex {
+        // Make sure we didn't overflow.
+        debug_assert_eq!(dynamic_offset & Self::FLAGS_MASK, 0);
+
+        PhaseItemExtraIndex(dynamic_offset)
+    }
+
+    /// Returns either a dynamic offset or [`PhaseItemExtraIndex::NONE`], as
+    /// appropriate.
+    #[inline]
+    pub fn maybe_dynamic_offset(maybe_dynamic_offset: Option<NonMaxU32>) -> PhaseItemExtraIndex {
+        match maybe_dynamic_offset {
+            Some(dynamic_offset) => Self::dynamic_offset(dynamic_offset.into()),
+            None => PhaseItemExtraIndex::NONE,
+        }
+    }
+
+    /// If this extra index describes a dynamic offset, returns it; otherwise,
+    /// returns `None`.
+    #[inline]
+    pub fn as_dynamic_offset(&self) -> Option<NonMaxU32> {
+        if self.is_dynamic_offset() {
+            NonMaxU32::try_from(self.0 & Self::OFFSET_MASK).ok()
+        } else {
+            None
+        }
+    }
+
+    /// If this extra index describes an indirect parameters index, returns it;
+    /// otherwise, returns `None`.
+    #[inline]
+    pub fn as_indirect_parameters_index(&self) -> Option<u32> {
+        if self.is_indirect_parameters_index() {
+            Some(self.0 & Self::OFFSET_MASK)
+        } else {
+            None
+        }
+    }
 }
 
 /// Represents phase items that are placed into bins. The `BinKey` specifies
@@ -633,7 +826,7 @@ pub trait BinnedPhaseItem: PhaseItem {
         key: Self::BinKey,
         representative_entity: Entity,
         batch_range: Range<u32>,
-        dynamic_offset: Option<NonMaxU32>,
+        extra_index: PhaseItemExtraIndex,
     ) -> Self;
 }
 
diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs
index 8a0f77daafb3d..7ff60c1f44078 100644
--- a/crates/bevy_render/src/render_resource/buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/buffer_vec.rs
@@ -228,6 +228,11 @@ where
         self.len = 0;
     }
 
+    /// Returns the length of the buffer.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
     /// Materializes the buffer on the GPU with space for `capacity` elements.
     ///
     /// If the buffer is already big enough, this function doesn't reallocate
diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs
index 1fef36c5844a0..bdeff34b0d407 100644
--- a/crates/bevy_render/src/view/mod.rs
+++ b/crates/bevy_render/src/view/mod.rs
@@ -208,6 +208,12 @@ pub struct PostProcessWrite<'a> {
     pub destination: &'a TextureView,
 }
 
+#[derive(Component)]
+pub struct GpuCulling;
+
+#[derive(Component)]
+pub struct NoCpuCulling;
+
 impl ViewTarget {
     pub const TEXTURE_FORMAT_HDR: TextureFormat = TextureFormat::Rgba16Float;
 
diff --git a/crates/bevy_render/src/view/visibility/mod.rs b/crates/bevy_render/src/view/visibility/mod.rs
index 72ba3c16a8681..f10d47b710ac7 100644
--- a/crates/bevy_render/src/view/visibility/mod.rs
+++ b/crates/bevy_render/src/view/visibility/mod.rs
@@ -20,6 +20,8 @@ use crate::{
     primitives::{Aabb, Frustum, Sphere},
 };
 
+use super::NoCpuCulling;
+
 /// User indication of whether an entity is visible. Propagates down the entity hierarchy.
 ///
 /// If an entity is hidden in this way, all [`Children`] (and all of their children and so on) who
@@ -397,6 +399,7 @@ pub fn check_visibility<QF>(
         &Frustum,
         Option<&RenderLayers>,
         &Camera,
+        Has<NoCpuCulling>,
     )>,
     mut visible_aabb_query: Query<
         (
@@ -413,7 +416,8 @@ pub fn check_visibility<QF>(
 ) where
     QF: QueryFilter + 'static,
 {
-    for (mut visible_entities, frustum, maybe_view_mask, camera) in &mut view_query {
+    for (mut visible_entities, frustum, maybe_view_mask, camera, no_cpu_culling) in &mut view_query
+    {
         if !camera.is_active {
             continue;
         }
@@ -445,7 +449,7 @@ pub fn check_visibility<QF>(
                 }
 
                 // If we have an aabb, do frustum culling
-                if !no_frustum_culling {
+                if !no_frustum_culling && !no_cpu_culling {
                     if let Some(model_aabb) = maybe_model_aabb {
                         let model = transform.affine();
                         let model_sphere = Sphere {
diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs
index 682d2296817ff..2805840a67694 100644
--- a/crates/bevy_sprite/src/mesh2d/material.rs
+++ b/crates/bevy_sprite/src/mesh2d/material.rs
@@ -17,8 +17,8 @@ use bevy_render::{
         prepare_assets, PrepareAssetError, RenderAsset, RenderAssetPlugin, RenderAssets,
     },
     render_phase::{
-        AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult,
-        SetItemPipeline, SortedRenderPhase, TrackedRenderPass,
+        AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand,
+        RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass,
     },
     render_resource::{
         AsBindGroup, AsBindGroupError, BindGroup, BindGroupId, BindGroupLayout,
@@ -451,7 +451,7 @@ pub fn queue_material2d_meshes<M: Material2d>(
                 sort_key: FloatOrd(mesh_z + material2d.depth_bias),
                 // Batching is done in batch_and_prepare_render_phase
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs
index 90a67f7517179..0dec7f0cbdac5 100644
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@@ -646,7 +646,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMesh2dBindGroup<I> {
     ) -> RenderCommandResult {
         let mut dynamic_offsets: [u32; 1] = Default::default();
         let mut offset_count = 0;
-        if let Some(dynamic_offset) = item.dynamic_offset() {
+        if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() {
             dynamic_offsets[offset_count] = dynamic_offset.get();
             offset_count += 1;
         }
diff --git a/crates/bevy_sprite/src/render/mod.rs b/crates/bevy_sprite/src/render/mod.rs
index ff0a475af815c..f0f2b524a32a4 100644
--- a/crates/bevy_sprite/src/render/mod.rs
+++ b/crates/bevy_sprite/src/render/mod.rs
@@ -19,8 +19,8 @@ use bevy_math::{Affine3A, FloatOrd, Quat, Rect, Vec2, Vec4};
 use bevy_render::{
     render_asset::RenderAssets,
     render_phase::{
-        DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult, SetItemPipeline,
-        SortedRenderPhase, TrackedRenderPass,
+        DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, RenderCommandResult,
+        SetItemPipeline, SortedRenderPhase, TrackedRenderPass,
     },
     render_resource::{
         binding_types::{sampler, texture_2d, uniform_buffer},
@@ -516,7 +516,7 @@ pub fn queue_sprites(
                 sort_key,
                 // batch_range and dynamic_offset will be calculated in prepare_sprites
                 batch_range: 0..0,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/crates/bevy_ui/src/render/mod.rs b/crates/bevy_ui/src/render/mod.rs
index d4c27a2141b33..881e6918fd88f 100644
--- a/crates/bevy_ui/src/render/mod.rs
+++ b/crates/bevy_ui/src/render/mod.rs
@@ -7,8 +7,12 @@ use bevy_core_pipeline::core_2d::graph::{Core2d, Node2d};
 use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d};
 use bevy_core_pipeline::{core_2d::Camera2d, core_3d::Camera3d};
 use bevy_hierarchy::Parent;
-use bevy_render::texture::GpuImage;
-use bevy_render::{render_phase::PhaseItem, view::ViewVisibility, ExtractSchedule, Render};
+use bevy_render::{
+    render_phase::{PhaseItem, PhaseItemExtraIndex},
+    texture::GpuImage,
+    view::ViewVisibility,
+    ExtractSchedule, Render,
+};
 use bevy_sprite::{SpriteAssetEvents, TextureAtlas};
 pub use pipeline::*;
 pub use render_pass::*;
@@ -901,7 +905,7 @@ pub fn queue_uinodes(
             ),
             // batch_range will be calculated in prepare_uinodes
             batch_range: 0..0,
-            dynamic_offset: None,
+            extra_index: PhaseItemExtraIndex::NONE,
         });
     }
 }
diff --git a/crates/bevy_ui/src/render/render_pass.rs b/crates/bevy_ui/src/render/render_pass.rs
index e398a46d93d24..d403a44bedc7f 100644
--- a/crates/bevy_ui/src/render/render_pass.rs
+++ b/crates/bevy_ui/src/render/render_pass.rs
@@ -15,7 +15,6 @@ use bevy_render::{
     renderer::*,
     view::*,
 };
-use nonmax::NonMaxU32;
 
 pub struct UiPassNode {
     ui_view_query: QueryState<
@@ -92,7 +91,7 @@ pub struct TransparentUi {
     pub pipeline: CachedRenderPipelineId,
     pub draw_function: DrawFunctionId,
     pub batch_range: Range<u32>,
-    pub dynamic_offset: Option<NonMaxU32>,
+    pub extra_index: PhaseItemExtraIndex,
 }
 
 impl PhaseItem for TransparentUi {
@@ -117,13 +116,13 @@ impl PhaseItem for TransparentUi {
     }
 
     #[inline]
-    fn dynamic_offset(&self) -> Option<NonMaxU32> {
-        self.dynamic_offset
+    fn extra_index(&self) -> PhaseItemExtraIndex {
+        self.extra_index
     }
 
     #[inline]
-    fn dynamic_offset_mut(&mut self) -> &mut Option<NonMaxU32> {
-        &mut self.dynamic_offset
+    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
+        (&mut self.batch_range, &mut self.extra_index)
     }
 }
 
diff --git a/crates/bevy_ui/src/render/ui_material_pipeline.rs b/crates/bevy_ui/src/render/ui_material_pipeline.rs
index b4d13ac65ac13..b1beaaa5d11c6 100644
--- a/crates/bevy_ui/src/render/ui_material_pipeline.rs
+++ b/crates/bevy_ui/src/render/ui_material_pipeline.rs
@@ -671,7 +671,7 @@ pub fn queue_ui_material_nodes<M: UiMaterial>(
                 entity.index(),
             ),
             batch_range: 0..0,
-            dynamic_offset: None,
+            extra_index: PhaseItemExtraIndex::NONE,
         });
     }
 }
diff --git a/examples/2d/mesh2d_manual.rs b/examples/2d/mesh2d_manual.rs
index 9be4c11f8067a..343db4be1d153 100644
--- a/examples/2d/mesh2d_manual.rs
+++ b/examples/2d/mesh2d_manual.rs
@@ -13,7 +13,10 @@ use bevy::{
     render::{
         mesh::{GpuMesh, Indices, MeshVertexAttribute},
         render_asset::{RenderAssetUsages, RenderAssets},
-        render_phase::{AddRenderCommand, DrawFunctions, SetItemPipeline, SortedRenderPhase},
+        render_phase::{
+            AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline,
+            SortedRenderPhase,
+        },
         render_resource::{
             BlendState, ColorTargetState, ColorWrites, Face, FragmentState, FrontFace,
             MultisampleState, PipelineCache, PolygonMode, PrimitiveState, PrimitiveTopology,
@@ -392,7 +395,7 @@ pub fn queue_colored_mesh2d(
                     sort_key: FloatOrd(mesh_z),
                     // This material is not batched
                     batch_range: 0..1,
-                    dynamic_offset: None,
+                    extra_index: PhaseItemExtraIndex::NONE,
                 });
             }
         }
diff --git a/examples/3d/deferred_rendering.rs b/examples/3d/deferred_rendering.rs
index 165d1de9098fc..6ee41a335a60b 100644
--- a/examples/3d/deferred_rendering.rs
+++ b/examples/3d/deferred_rendering.rs
@@ -7,9 +7,10 @@ use bevy::{
         fxaa::Fxaa,
         prepass::{DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass},
     },
-    pbr::NotShadowReceiver,
-    pbr::{CascadeShadowConfigBuilder, DirectionalLightShadowMap},
-    pbr::{DefaultOpaqueRendererMethod, NotShadowCaster, OpaqueRendererMethod},
+    pbr::{
+        CascadeShadowConfigBuilder, DefaultOpaqueRendererMethod, DirectionalLightShadowMap,
+        NotShadowCaster, NotShadowReceiver, OpaqueRendererMethod,
+    },
     prelude::*,
     render::render_resource::TextureFormat,
 };
diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs
index b1c91c8973b07..17fd7823c95ad 100644
--- a/examples/shader/shader_instancing.rs
+++ b/examples/shader/shader_instancing.rs
@@ -15,8 +15,8 @@ use bevy::{
         mesh::{GpuBufferInfo, GpuMesh, MeshVertexBufferLayoutRef},
         render_asset::RenderAssets,
         render_phase::{
-            AddRenderCommand, DrawFunctions, PhaseItem, RenderCommand, RenderCommandResult,
-            SetItemPipeline, SortedRenderPhase, TrackedRenderPass,
+            AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand,
+            RenderCommandResult, SetItemPipeline, SortedRenderPhase, TrackedRenderPass,
         },
         render_resource::*,
         renderer::RenderDevice,
@@ -144,7 +144,7 @@ fn queue_custom(
                 draw_function: draw_custom,
                 distance: rangefinder.distance_translation(&mesh_instance.translation),
                 batch_range: 0..1,
-                dynamic_offset: None,
+                extra_index: PhaseItemExtraIndex::NONE,
             });
         }
     }
diff --git a/examples/stress_tests/many_cubes.rs b/examples/stress_tests/many_cubes.rs
index 8d583d0535e6a..a10ced7d0f74b 100644
--- a/examples/stress_tests/many_cubes.rs
+++ b/examples/stress_tests/many_cubes.rs
@@ -20,7 +20,7 @@ use bevy::{
         batching::NoAutomaticBatching,
         render_asset::RenderAssetUsages,
         render_resource::{Extent3d, TextureDimension, TextureFormat},
-        view::NoFrustumCulling,
+        view::{GpuCulling, NoCpuCulling, NoFrustumCulling},
     },
     window::{PresentMode, WindowResolution},
     winit::{UpdateMode, WinitSettings},
@@ -51,7 +51,7 @@ struct Args {
     #[argh(option, default = "1")]
     mesh_count: usize,
 
-    /// whether to disable frustum culling. Stresses queuing and batching as all mesh material entities in the scene are always drawn.
+    /// whether to disable all frustum culling. Stresses queuing and batching as all mesh material entities in the scene are always drawn.
     #[argh(switch)]
     no_frustum_culling: bool,
 
@@ -59,6 +59,14 @@ struct Args {
     #[argh(switch)]
     no_automatic_batching: bool,
 
+    /// whether to enable GPU culling.
+    #[argh(switch)]
+    gpu_culling: bool,
+
+    /// whether to disable CPU culling.
+    #[argh(switch)]
+    no_cpu_culling: bool,
+
     /// whether to enable directional light cascaded shadow mapping.
     #[argh(switch)]
     shadows: bool,
@@ -172,7 +180,14 @@ fn setup(
             }
 
             // camera
-            commands.spawn(Camera3dBundle::default());
+            let mut camera = commands.spawn(Camera3dBundle::default());
+            if args.gpu_culling {
+                camera.insert(GpuCulling);
+            }
+            if args.no_cpu_culling {
+                camera.insert(NoCpuCulling);
+            }
+
             // Inside-out box around the meshes onto which shadows are cast (though you cannot see them...)
             commands.spawn((
                 PbrBundle {