From 1bcbee8346663f788ba3921d806738afebfe7489 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 28 Mar 2024 17:09:13 -0700 Subject: [PATCH 01/39] Generate `MeshUniform`s on the GPU via compute shader where available. Currently, `MeshUniform`s are rather large: 160 bytes. They're also somewhat expensive to compute, because they involve taking the inverse of a 3x4 matrix. Finally, if a mesh is present in multiple views, that mesh will have a separate `MeshUniform` for each and every view, which is wasteful. This commit fixes these issues by introducing the concept of a *mesh input uniform* and adding a *mesh uniform building* compute shader pass. The `MeshInputUniform` is simply the minimum amount of data needed for the GPU to compute the full `MeshUniform`. Most of this data is simply the transform and is therefore only 64 bytes. `MeshInputUniform`s are computed during the *extraction* phase, much like skins are today, in order to avoid needlessly copying transforms around on CPU. (In fact, the render app has been changed to only store the translation of each mesh; it no longer cares about any other part of the transform, which is stored only on the GPU and the main world.) Before rendering, the `build_mesh_uniforms` pass runs to expand the `MeshInputUniform`s to the full `MeshUniform`. The mesh uniform building pass does the following: 1. Copy the appropriate fields of the `MeshInputUniform` to the `MeshUniform` slot. If a single mesh is present in multiple views, this effectively duplicates it into each view. 2. Compute the inverse transpose of the model transform, used for transforming normals. 3. If applicable, copy the mesh's transform from the previous frame for TAA. To support this, we double-buffer the `MeshInputUniform`s over two frames and swap the buffers each frame. The `MeshInputUniform`s for the current frame contain the index of that mesh's `MeshInputUniform` for the previous frame. 
This commit produces wins in virtually every CPU part of the pipeline: `extract_meshes`, `queue_material_meshes`, `batch_and_prepare_render_phase`, and especially `write_batched_instance_buffer` are all faster. Shrinking the amount of CPU data that has to be shuffled around speeds up the entire rendering process. | Benchmark | This branch | `main` | Speedup | |------------------------|-------------|---------|---------| | `many_cubes -nfc` | 21.878 | 30.117 | 37.65% | | `many_cubes -nfc -vpi` | 302.116 | 312.123 | 3.31% | | `many_foxes` | 3.227 | 3.515 | 8.92% | Because mesh uniform building requires compute shader, and WebGL 2 has no compute shader, the existing CPU mesh uniform building code has been left as-is. Many types now have both CPU mesh uniform building and GPU mesh uniform building modes. Developers can opt into the old CPU mesh uniform building by setting the `using_gpu_uniform_builder` option on `PbrPlugin` to `false`. --- crates/bevy_pbr/src/lib.rs | 12 +- crates/bevy_pbr/src/lightmap/mod.rs | 8 +- crates/bevy_pbr/src/material.rs | 12 +- crates/bevy_pbr/src/prepass/mod.rs | 13 +- .../src/render/build_mesh_uniforms.rs | 226 ++++++++++ .../src/render/build_mesh_uniforms.wgsl | 76 ++++ crates/bevy_pbr/src/render/light.rs | 8 +- crates/bevy_pbr/src/render/mesh.rs | 394 +++++++++++++++--- crates/bevy_pbr/src/render/mod.rs | 2 + crates/bevy_render/src/batching/mod.rs | 134 +++++- crates/bevy_render/src/maths.wgsl | 18 + .../src/render_resource/buffer_vec.rs | 78 ++++ crates/bevy_sprite/src/mesh2d/mesh.rs | 37 +- examples/shader/shader_instancing.rs | 8 +- 14 files changed, 933 insertions(+), 93 deletions(-) create mode 100644 crates/bevy_pbr/src/render/build_mesh_uniforms.rs create mode 100644 crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 1f90e1a21a439..85b3ed3a631b8 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -78,6 +78,7 @@ pub mod graph { /// 
Label for the screen space ambient occlusion render node. ScreenSpaceAmbientOcclusion, DeferredLightingPass, + BuildMeshUniforms, } } @@ -133,6 +134,8 @@ pub struct PbrPlugin { pub prepass_enabled: bool, /// Controls if [`DeferredPbrLightingPlugin`] is added. pub add_default_deferred_lighting_plugin: bool, + /// Controls if GPU [`MeshUniform`] building is enabled. + pub using_gpu_uniform_builder: bool, } impl Default for PbrPlugin { @@ -140,6 +143,7 @@ impl Default for PbrPlugin { Self { prepass_enabled: true, add_default_deferred_lighting_plugin: true, + using_gpu_uniform_builder: true, } } } @@ -280,7 +284,9 @@ impl Plugin for PbrPlugin { .register_type::() .init_resource::() .add_plugins(( - MeshRenderPlugin, + MeshRenderPlugin { + using_gpu_uniform_builder: self.using_gpu_uniform_builder, + }, MaterialPlugin:: { prepass_enabled: self.prepass_enabled, ..Default::default() @@ -352,6 +358,10 @@ impl Plugin for PbrPlugin { app.add_plugins(DeferredPbrLightingPlugin); } + if self.using_gpu_uniform_builder { + app.add_plugins(BuildMeshUniformsPlugin); + } + app.world.resource_mut::>().insert( &Handle::::default(), StandardMaterial { diff --git a/crates/bevy_pbr/src/lightmap/mod.rs b/crates/bevy_pbr/src/lightmap/mod.rs index 30c4bc631fb21..4492ce007efe8 100644 --- a/crates/bevy_pbr/src/lightmap/mod.rs +++ b/crates/bevy_pbr/src/lightmap/mod.rs @@ -132,7 +132,9 @@ impl Plugin for LightmapPlugin { render_app.init_resource::().add_systems( ExtractSchedule, - extract_lightmaps.after(crate::extract_meshes), + extract_lightmaps + .after(crate::extract_meshes_for_cpu_building) + .after(crate::extract_meshes_for_gpu_building), ); } } @@ -157,8 +159,8 @@ fn extract_lightmaps( if !view_visibility.get() || images.get(&lightmap.image).is_none() || !render_mesh_instances - .get(&entity) - .and_then(|mesh_instance| meshes.get(mesh_instance.mesh_asset_id)) + .mesh_asset_id(entity) + .and_then(|mesh_asset_id| meshes.get(mesh_asset_id)) .is_some_and(|mesh| 
mesh.layout.0.contains(Mesh::ATTRIBUTE_UV_1.id)) { continue; diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index 4e42cde7f5696..f33a8ede60654 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -649,7 +649,8 @@ pub fn queue_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(mesh) = render_meshes.get(mesh_instance.mesh_asset_id) else { @@ -710,8 +711,7 @@ pub fn queue_material_meshes( match material.properties.alpha_mode { AlphaMode::Opaque => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -734,8 +734,7 @@ pub fn queue_material_meshes( } AlphaMode::Mask(_) => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -760,8 +759,7 @@ pub fn queue_material_meshes( | AlphaMode::Premultiplied | AlphaMode::Add | AlphaMode::Multiply => { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transparent_phase.add(Transparent3d { entity: *visible_entity, diff --git a/crates/bevy_pbr/src/prepass/mod.rs 
b/crates/bevy_pbr/src/prepass/mod.rs index 6e78c8f4c8c5e..f0ce0d7c31b58 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -156,8 +156,14 @@ where Render, ( prepare_previous_view_projection_uniforms, - batch_and_prepare_render_phase::, - batch_and_prepare_render_phase::, + batch_and_prepare_render_phase::< + Opaque3dPrepass, + MeshPipeline, + >, + batch_and_prepare_render_phase::< + AlphaMask3dPrepass, + MeshPipeline, + >, ) .in_set(RenderSet::PrepareResources), ); @@ -773,7 +779,8 @@ pub fn queue_prepass_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(material) = render_materials.get(material_asset_id) else { diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs new file mode 100644 index 0000000000000..4db04f516a9fb --- /dev/null +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -0,0 +1,226 @@ +//! Build mesh uniforms. 
+ +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_ecs::{ + query::QueryItem, + schedule::IntoSystemConfigs as _, + system::{Res, ResMut, Resource}, + world::{FromWorld, World}, +}; +use bevy_render::{ + batching::BatchedInstanceBuffers, + render_graph::{NodeRunError, RenderGraphApp, RenderGraphContext, ViewNode, ViewNodeRunner}, + render_resource::{ + binding_types::{storage_buffer, storage_buffer_read_only}, + BindGroupEntries, BindGroupLayout, CachedComputePipelineId, ComputePassDescriptor, + ComputePipelineDescriptor, DynamicBindGroupLayoutEntries, PipelineCache, Shader, + ShaderStages, SpecializedComputePipeline, SpecializedComputePipelines, + }, + renderer::{RenderContext, RenderDevice}, + Render, RenderApp, RenderSet, +}; +use bevy_utils::tracing::warn; + +use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; + +pub const BUILD_MESH_UNIFORMS_SHADER_HANDLE: Handle = + Handle::weak_from_u128(16991728318640779533); + +const WORKGROUP_SIZE: usize = 64; + +pub struct BuildMeshUniformsPlugin; + +#[derive(Default)] +pub struct BuildMeshUniformsNode; + +#[derive(Resource)] +pub struct BuildMeshUniformsPipeline { + pub bind_group_layout: BindGroupLayout, + /// This gets filled in in `prepare_build_mesh_uniforms_pipeline`. 
+ pub pipeline_id: Option, +} + +impl Plugin for BuildMeshUniformsPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + BUILD_MESH_UNIFORMS_SHADER_HANDLE, + "build_mesh_uniforms.wgsl", + Shader::from_wgsl + ); + + let Ok(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems( + Render, + prepare_build_mesh_uniforms_pipeline.in_set(RenderSet::Prepare), + ); + } + + fn finish(&self, app: &mut App) { + let Ok(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app + .add_render_graph_node::>( + Core3d, + NodePbr::BuildMeshUniforms, + ) + .add_render_graph_edges( + Core3d, + ( + Node3d::StartMainPass, + NodePbr::BuildMeshUniforms, + Node3d::MainOpaquePass, + ), + ) + .init_resource::() + .init_resource::>(); + } +} + +impl ViewNode for BuildMeshUniformsNode { + type ViewQuery = (); + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + _: QueryItem<'w, Self::ViewQuery>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let BatchedInstanceBuffers::GpuBuilt { + data_buffer: ref data_buffer_vec, + index_buffer: ref index_buffer_vec, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + index_count, + } = world.resource::>() + else { + return Ok(()); + }; + + let pipeline_cache = world.resource::(); + let build_mesh_uniforms_pipeline = world.resource::(); + + let Some(build_mesh_uniforms_pipeline_id) = build_mesh_uniforms_pipeline.pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't uploaded"); + return Ok(()); + }; + + let Some(view_build_mesh_uniforms_pipeline) = + pipeline_cache.get_compute_pipeline(build_mesh_uniforms_pipeline_id) + else { + warn!("The view build mesh uniforms pipeline wasn't present in the pipeline cache"); + return Ok(()); + }; + + let Some(current_input_buffer) = current_input_buffer_vec.buffer() else { + warn!("The current input buffer 
wasn't uploaded"); + return Ok(()); + }; + let Some(previous_input_buffer) = previous_input_buffer_vec.buffer() else { + warn!("The previous input buffer wasn't uploaded"); + return Ok(()); + }; + let Some(index_buffer) = index_buffer_vec.buffer() else { + warn!("The index buffer wasn't uploaded"); + return Ok(()); + }; + let Some(data_buffer) = data_buffer_vec.buffer() else { + warn!("The data buffer wasn't uploaded"); + return Ok(()); + }; + + // TODO: Do this in a separate system and cache it. + let bind_group = render_context.render_device().create_bind_group( + "build_mesh_uniforms_bind_group", + &build_mesh_uniforms_pipeline.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + index_buffer.as_entire_binding(), + data_buffer.as_entire_binding(), + )), + ); + + let mut compute_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some("build mesh uniforms"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(view_build_mesh_uniforms_pipeline); + compute_pass.set_bind_group(0, &bind_group, &[]); + let workgroup_count = div_round_up(*index_count, WORKGROUP_SIZE); + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + + Ok(()) + } +} + +impl SpecializedComputePipeline for BuildMeshUniformsPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("build mesh uniforms".into()), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![], + shader: BUILD_MESH_UNIFORMS_SHADER_HANDLE, + shader_defs: vec![], + entry_point: "main".into(), + } + } +} + +impl FromWorld for BuildMeshUniformsPipeline { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + 
storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer::(/*has_dynamic_offset=*/ false), + ), + ); + + let bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms bind group layout", + &bind_group_layout_entries, + ); + + BuildMeshUniformsPipeline { + bind_group_layout, + pipeline_id: None, + } + } +} + +pub fn prepare_build_mesh_uniforms_pipeline( + pipeline_cache: Res, + mut pipelines: ResMut>, + mut build_mesh_uniforms_pipeline: ResMut, +) { + if build_mesh_uniforms_pipeline.pipeline_id.is_some() { + return; + } + + let build_mesh_uniforms_pipeline_id = + pipelines.specialize(&pipeline_cache, &build_mesh_uniforms_pipeline, ()); + build_mesh_uniforms_pipeline.pipeline_id = Some(build_mesh_uniforms_pipeline_id); +} + +fn div_round_up(a: usize, b: usize) -> usize { + (a + b - 1) / b +} diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl b/crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl new file mode 100644 index 0000000000000..a013e057fbf68 --- /dev/null +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl @@ -0,0 +1,76 @@ +// GPU mesh uniform building. +// +// This is a compute shader that expands each `MeshInputUniform` out to a full +// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` +// and `MeshUniform` are in a 1:N relationship.) It runs in parallel for all +// meshes for all views. As part of this process, the shader gathers each +// mesh's transform on the previous frame and writes it into the `MeshUniform` +// so that TAA works. + +#import bevy_pbr::mesh_types::Mesh +#import bevy_render::maths + +// Per-frame data that the CPU supplies to the GPU. +struct MeshInput { + // The model transform. + model: mat3x4, + // The lightmap UV rect, packed into 64 bits. + lightmap_uv_rect: vec2, + // Various flags. 
+ flags: u32, + // The index of this mesh's `MeshInput` in the `previous_input` array, if + // applicable. If not present, this is `~0`. + previous_input_index: u32, +} + +// The current frame's `MeshInput`. +@group(0) @binding(0) var current_input: array; +// The `MeshInput` values from the previous frame. +@group(0) @binding(1) var previous_input: array; +// Indices into the `MeshInput` buffer. +// +// There may be many indices that map to the same `MeshInput`. +@group(0) @binding(2) var indices: array; +// The output array of `Mesh`es. +@group(0) @binding(3) var output: array; + +@compute +@workgroup_size(64) +fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { + let instance_index = global_invocation_id.x; + if (instance_index >= arrayLength(&output)) { + return; + } + + // Unpack. + let mesh_index = indices[instance_index]; + let model_affine_transpose = current_input[mesh_index].model; + let model = maths::affine3_to_square(model_affine_transpose); + + // Calculate inverse transpose. + let inverse_transpose_model = transpose(maths::inverse_affine3(transpose( + model_affine_transpose))); + + // Pack inverse transpose. + let inverse_transpose_model_a = mat2x4( + vec4(inverse_transpose_model[0].xyz, inverse_transpose_model[1].x), + vec4(inverse_transpose_model[1].yz, inverse_transpose_model[2].xy)); + let inverse_transpose_model_b = inverse_transpose_model[2].z; + + // Look up the previous model matrix. + let previous_input_index = current_input[mesh_index].previous_input_index; + var previous_model: mat3x4; + if (previous_input_index == 0xffffffff) { + previous_model = model_affine_transpose; + } else { + previous_model = previous_input[previous_input_index].model; + } + + // Write the output. 
+ output[instance_index].model = model_affine_transpose; + output[instance_index].previous_model = previous_model; + output[instance_index].inverse_transpose_model_a = inverse_transpose_model_a; + output[instance_index].inverse_transpose_model_b = inverse_transpose_model_b; + output[instance_index].flags = current_input[mesh_index].flags; + output[instance_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect; +} diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 12d8961f988a4..46eab5170491e 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -1644,10 +1644,14 @@ pub fn queue_shadows( // NOTE: Lights with shadow mapping disabled will have no visible entities // so no meshes will be queued for entity in visible_entities.iter().copied() { - let Some(mesh_instance) = render_mesh_instances.get(&entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) + else { continue; }; - if !mesh_instance.shadow_caster { + if !mesh_instance + .flags + .contains(RenderMeshInstanceFlags::SHADOW_CASTER) + { continue; } let Some(material_asset_id) = render_material_instances.get(&entity) else { diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 31f3a29352a28..d009e0f99bfca 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -1,20 +1,22 @@ +use std::mem; + use bevy_asset::{load_internal_asset, AssetId}; use bevy_core_pipeline::{ core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT}, deferred::{AlphaMask3dDeferred, Opaque3dDeferred}, }; -use bevy_derive::{Deref, DerefMut}; +use bevy_derive::Deref; use bevy_ecs::entity::EntityHashMap; use bevy_ecs::{ prelude::*, query::ROQueryItem, system::{lifetimeless::*, SystemParamItem, SystemState}, }; -use bevy_math::{Affine3, Rect, UVec2, Vec4}; +use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use 
bevy_render::{ batching::{ - batch_and_prepare_render_phase, write_batched_instance_buffer, GetBatchData, - NoAutomaticBatching, + batch_and_prepare_render_phase, write_batched_instance_buffer, BatchedInstanceBuffers, + GetBatchData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, @@ -30,6 +32,7 @@ use bevy_utils::{tracing::error, Entry, HashMap, Parallel}; #[cfg(debug_assertions)] use bevy_utils::warn_once; +use bytemuck::{Pod, Zeroable}; use crate::render::{ morph::{ @@ -43,8 +46,9 @@ use self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; -#[derive(Default)] -pub struct MeshRenderPlugin; +pub struct MeshRenderPlugin { + pub using_gpu_uniform_builder: bool, +} pub const FORWARD_IO_HANDLE: Handle = Handle::weak_from_u128(2645551199423808407); pub const MESH_VIEW_TYPES_HANDLE: Handle = Handle::weak_from_u128(8140454348013264787); @@ -67,6 +71,14 @@ pub const MORPH_HANDLE: Handle = Handle::weak_from_u128(9709828135876073 #[cfg(debug_assertions)] pub const MESH_PIPELINE_VIEW_LAYOUT_SAFE_MAX_TEXTURES: usize = 10; +impl Default for MeshRenderPlugin { + fn default() -> Self { + Self { + using_gpu_uniform_builder: true, + } + } +} + impl Plugin for MeshRenderPlugin { fn build(&self, app: &mut App) { load_internal_asset!(app, FORWARD_IO_HANDLE, "forward_io.wgsl", Shader::from_wgsl); @@ -109,18 +121,16 @@ impl Plugin for MeshRenderPlugin { ); if let Ok(render_app) = app.get_sub_app_mut(RenderApp) { + let render_mesh_instances = RenderMeshInstances::new(self.using_gpu_uniform_builder); + render_app - .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() - .allow_ambiguous_resource::>() - .add_systems( - ExtractSchedule, - (extract_meshes, extract_skins, extract_morphs), - ) + .insert_resource(render_mesh_instances) + .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) .add_systems( Render, ( @@ -142,6 +152,12 @@ impl Plugin for MeshRenderPlugin { 
prepare_mesh_view_bind_groups.in_set(RenderSet::PrepareBindGroups), ), ); + + if self.using_gpu_uniform_builder { + render_app.add_systems(ExtractSchedule, extract_meshes_for_gpu_building); + } else { + render_app.add_systems(ExtractSchedule, extract_meshes_for_cpu_building); + } } } @@ -149,9 +165,16 @@ impl Plugin for MeshRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Ok(render_app) = app.get_sub_app_mut(RenderApp) { - if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size( - render_app.world.resource::(), - ) { + let render_device = render_app.world.resource::(); + let batched_instance_buffers = + BatchedInstanceBuffers::::new( + render_device, + self.using_gpu_uniform_builder, + ); + + if let Some(per_object_buffer_batch_size) = + GpuArrayBuffer::::batch_size(render_device) + { mesh_bindings_shader_defs.push(ShaderDefVal::UInt( "PER_OBJECT_BUFFER_BATCH_SIZE".into(), per_object_buffer_batch_size, @@ -159,9 +182,7 @@ impl Plugin for MeshRenderPlugin { } render_app - .insert_resource(GpuArrayBuffer::::new( - render_app.world.resource::(), - )) + .insert_resource(batched_instance_buffers) .init_resource::(); } @@ -208,6 +229,16 @@ pub struct MeshUniform { pub lightmap_uv_rect: UVec2, } +#[derive(ShaderType, Pod, Zeroable, Clone, Copy)] +#[repr(C)] +pub struct MeshInputUniform { + // Affine 4x3 matrix transposed to 3x4 + pub transform: [Vec4; 3], + pub lightmap_uv_rect: UVec2, + pub flags: u32, + pub previous_input_index: u32, +} + impl MeshUniform { pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option) -> Self { let (inverse_transpose_model_a, inverse_transpose_model_b) = @@ -237,26 +268,104 @@ bitflags::bitflags! { } } -pub struct RenderMeshInstance { +bitflags::bitflags! 
{ + #[derive(Clone, Copy)] + pub struct RenderMeshInstanceFlags: u8 { + const SHADOW_CASTER = 1 << 0; + const AUTOMATIC_BATCHING = 1 << 1; + const HAVE_PREVIOUS_TRANSFORM = 1 << 2; + } +} + +#[derive(Deref)] +pub struct RenderMeshInstanceCpu { + #[deref] + pub shared: RenderMeshInstanceShared, pub transforms: MeshTransforms, +} + +#[derive(Deref)] +pub struct RenderMeshInstanceGpu { + #[deref] + pub shared: RenderMeshInstanceShared, + pub translation: Vec3, + pub current_uniform_index: u32, +} + +pub struct RenderMeshInstanceShared { pub mesh_asset_id: AssetId, pub material_bind_group_id: AtomicMaterialBindGroupId, - pub shadow_caster: bool, - pub automatic_batching: bool, + pub flags: RenderMeshInstanceFlags, } -impl RenderMeshInstance { +pub struct RenderMeshInstanceGpuBuilder { + pub shared: RenderMeshInstanceShared, + pub transform: Affine3, + pub mesh_flags: MeshFlags, +} + +impl RenderMeshInstanceShared { pub fn should_batch(&self) -> bool { - self.automatic_batching && self.material_bind_group_id.get().is_some() + self.flags + .contains(RenderMeshInstanceFlags::AUTOMATIC_BATCHING) + && self.material_bind_group_id.get().is_some() } } -#[derive(Default, Resource, Deref, DerefMut)] -pub struct RenderMeshInstances(EntityHashMap); +#[derive(Resource)] +pub enum RenderMeshInstances { + CpuBuilding(EntityHashMap), + GpuBuilding(EntityHashMap), +} -pub fn extract_meshes( +impl RenderMeshInstances { + fn new(using_gpu_uniform_builder: bool) -> RenderMeshInstances { + if using_gpu_uniform_builder { + RenderMeshInstances::GpuBuilding(EntityHashMap::default()) + } else { + RenderMeshInstances::CpuBuilding(EntityHashMap::default()) + } + } + + pub(crate) fn mesh_asset_id(&self, entity: Entity) -> Option> { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => instances + .get(&entity) + .map(|instance| instance.mesh_asset_id), + RenderMeshInstances::GpuBuilding(ref instances) => instances + .get(&entity) + .map(|instance| instance.mesh_asset_id), + } + } + 
+ pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => { + instances.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.transforms.transform.translation, + }) + } + RenderMeshInstances::GpuBuilding(ref instances) => { + instances.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.translation, + }) + } + } + } +} + +#[derive(Deref)] +pub struct RenderMeshQueueData<'a> { + #[deref] + pub shared: &'a RenderMeshInstanceShared, + pub translation: Vec3, +} + +pub fn extract_meshes_for_cpu_building( mut render_mesh_instances: ResMut, - mut thread_local_queues: Local>>, + mut render_mesh_instance_queues: Local>>, meshes_query: Extract< Query<( Entity, @@ -287,44 +396,203 @@ pub fn extract_meshes( return; } let transform = transform.affine(); - let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform); - let mut flags = if not_shadow_receiver { + let mut mesh_flags = if not_shadow_receiver { MeshFlags::empty() } else { MeshFlags::SHADOW_RECEIVER }; if transmitted_receiver { - flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; + mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; } if transform.matrix3.determinant().is_sign_positive() { - flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; } - let transforms = MeshTransforms { - transform: (&transform).into(), - previous_transform: (&previous_transform).into(), - flags: flags.bits(), - }; - thread_local_queues.scope(|queue| { + + let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); + mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); + mesh_instance_flags.set( + RenderMeshInstanceFlags::AUTOMATIC_BATCHING, + !no_automatic_batching, + ); + mesh_instance_flags.set( + RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, + 
previous_transform.is_some(), + ); + + render_mesh_instance_queues.scope(|queue| { + let shared = RenderMeshInstanceShared { + mesh_asset_id: handle.id(), + + flags: mesh_instance_flags, + material_bind_group_id: AtomicMaterialBindGroupId::default(), + }; + queue.push(( entity, - RenderMeshInstance { - mesh_asset_id: handle.id(), - transforms, - shadow_caster: !not_shadow_caster, - material_bind_group_id: AtomicMaterialBindGroupId::default(), - automatic_batching: !no_automatic_batching, + RenderMeshInstanceCpu { + transforms: MeshTransforms { + transform: (&transform).into(), + previous_transform: (&previous_transform + .map(|t| t.0) + .unwrap_or(transform)) + .into(), + flags: mesh_flags.bits(), + }, + shared, }, )); }); }, ); + // Collect the render mesh instances. + let RenderMeshInstances::CpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`extract_meshes_for_cpu_building` should only be called if we're using CPU \ + `MeshUniform` building" + ); + }; + render_mesh_instances.clear(); - for queue in thread_local_queues.iter_mut() { + for queue in render_mesh_instance_queues.iter_mut() { render_mesh_instances.extend(queue.drain(..)); } } +pub fn extract_meshes_for_gpu_building( + mut render_mesh_instances: ResMut, + mut batched_instance_buffers: ResMut>, + mut render_mesh_instance_queues: Local>>, + mut prev_render_mesh_instances: Local>, + meshes_query: Extract< + Query<( + Entity, + &ViewVisibility, + &GlobalTransform, + Option<&PreviousGlobalTransform>, + &Handle, + Has, + Has, + Has, + Has, + )>, + >, +) { + meshes_query.par_iter().for_each( + |( + entity, + view_visibility, + transform, + previous_transform, + handle, + not_shadow_receiver, + transmitted_receiver, + not_shadow_caster, + no_automatic_batching, + )| { + if !view_visibility.get() { + return; + } + let transform = transform.affine(); + let mut mesh_flags = if not_shadow_receiver { + MeshFlags::empty() + } else { + MeshFlags::SHADOW_RECEIVER + }; + if 
transmitted_receiver { + mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; + } + if transform.matrix3.determinant().is_sign_positive() { + mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + } + + let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); + mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); + mesh_instance_flags.set( + RenderMeshInstanceFlags::AUTOMATIC_BATCHING, + !no_automatic_batching, + ); + mesh_instance_flags.set( + RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, + previous_transform.is_some(), + ); + + render_mesh_instance_queues.scope(|queue| { + let shared = RenderMeshInstanceShared { + mesh_asset_id: handle.id(), + + flags: mesh_instance_flags, + material_bind_group_id: AtomicMaterialBindGroupId::default(), + }; + queue.push(( + entity, + RenderMeshInstanceGpuBuilder { + shared, + transform: (&transform).into(), + mesh_flags, + }, + )); + }); + }, + ); + + // Collect render mesh instances. Build up the uniform buffer. + let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`extract_meshes_for_gpu_building` should only be called if we're using GPU \ + `MeshUniform` building" + ); + }; + let BatchedInstanceBuffers::GpuBuilt { + ref mut current_input_buffer, + ref mut previous_input_buffer, + .. + } = *batched_instance_buffers + else { + unreachable!() + }; + + // Swap buffers. + mem::swap(current_input_buffer, previous_input_buffer); + mem::swap(render_mesh_instances, &mut prev_render_mesh_instances); + + render_mesh_instances.clear(); + for queue in render_mesh_instance_queues.iter_mut() { + for (entity, builder) in queue.drain(..) 
{ + let previous_input_index = if builder + .shared + .flags + .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM) + { + prev_render_mesh_instances + .get(&entity) + .map(|render_mesh_instance| render_mesh_instance.current_uniform_index) + } else { + None + }; + + let current_uniform_index = current_input_buffer.push(MeshInputUniform { + transform: builder.transform.to_transpose(), + // TODO: Track this. + lightmap_uv_rect: lightmap::pack_lightmap_uv_rect(None), + flags: builder.mesh_flags.bits(), + previous_input_index: previous_input_index.unwrap_or(!0), + }); + + render_mesh_instances.insert( + entity, + RenderMeshInstanceGpu { + translation: builder.transform.translation, + shared: builder.shared, + current_uniform_index: current_uniform_index as u32, + }, + ); + } + } +} + #[derive(Resource, Clone)] pub struct MeshPipeline { view_layouts: [MeshPipelineViewLayout; MeshPipelineViewLayoutKey::COUNT], @@ -450,10 +718,15 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; + type BufferInputData = MeshInputUniform; + fn get_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { + let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { + return None; + }; let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); @@ -469,6 +742,26 @@ impl GetBatchData for MeshPipeline { )), )) } + + fn get_batch_index( + (mesh_instances, lightmaps): &SystemParamItem, + entity: Entity, + ) -> Option<(u32, Option)> { + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + return None; + }; + let mesh_instance = mesh_instances.get(&entity)?; + let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); + + Some(( + mesh_instance.current_uniform_index, + mesh_instance.should_batch().then_some(( + mesh_instance.material_bind_group_id.get(), + mesh_instance.mesh_asset_id, + 
maybe_lightmap.map(|lightmap| lightmap.image), + )), + )) + } } bitflags::bitflags! { @@ -982,14 +1275,14 @@ pub fn prepare_mesh_bind_group( mut groups: ResMut, mesh_pipeline: Res, render_device: Res, - mesh_uniforms: Res>, + mesh_uniforms: Res>, skins_uniform: Res, weights_uniform: Res, render_lightmaps: Res, ) { groups.reset(); let layouts = &mesh_pipeline.mesh_layouts; - let Some(model) = mesh_uniforms.binding() else { + let Some(model) = mesh_uniforms.uniform_binding() else { return; }; groups.model_only = Some(layouts.model_only(&render_device, &model)); @@ -1091,7 +1384,7 @@ impl RenderCommand

for SetMeshBindGroup { let entity = &item.entity(); - let Some(mesh) = mesh_instances.get(entity) else { + let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(*entity) else { return RenderCommandResult::Success; }; let skin_index = skin_indices.get(entity); @@ -1105,8 +1398,7 @@ impl RenderCommand

for SetMeshBindGroup { .get(entity) .map(|render_lightmap| render_lightmap.image); - let Some(bind_group) = - bind_groups.get(mesh.mesh_asset_id, lightmap, is_skinned, is_morphed) + let Some(bind_group) = bind_groups.get(mesh_asset_id, lightmap, is_skinned, is_morphed) else { error!( "The MeshBindGroups resource wasn't set in the render phase. \ @@ -1152,10 +1444,10 @@ impl RenderCommand

for DrawMesh { let meshes = meshes.into_inner(); let mesh_instances = mesh_instances.into_inner(); - let Some(mesh_instance) = mesh_instances.get(&item.entity()) else { + let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else { return RenderCommandResult::Failure; }; - let Some(gpu_mesh) = meshes.get(mesh_instance.mesh_asset_id) else { + let Some(gpu_mesh) = meshes.get(mesh_asset_id) else { return RenderCommandResult::Failure; }; diff --git a/crates/bevy_pbr/src/render/mod.rs b/crates/bevy_pbr/src/render/mod.rs index 7efffc05681f8..ea5b4158bd946 100644 --- a/crates/bevy_pbr/src/render/mod.rs +++ b/crates/bevy_pbr/src/render/mod.rs @@ -1,3 +1,4 @@ +mod build_mesh_uniforms; mod fog; mod light; pub(crate) mod mesh; @@ -6,6 +7,7 @@ mod mesh_view_bindings; mod morph; mod skin; +pub use build_mesh_uniforms::*; pub use fog::*; pub use light::*; pub use mesh::*; diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 54b0573081f67..492019eebea9b 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -2,13 +2,17 @@ use bevy_ecs::{ component::Component, entity::Entity, prelude::Res, - system::{Query, ResMut, StaticSystemParam, SystemParam, SystemParamItem}, + system::{Query, ResMut, Resource, StaticSystemParam, SystemParam, SystemParamItem}, }; +use bytemuck::Pod; use nonmax::NonMaxU32; +use wgpu::{BindingResource, BufferUsages}; use crate::{ render_phase::{CachedRenderPipelinePhaseItem, DrawFunctionId, RenderPhase}, - render_resource::{CachedRenderPipelineId, GpuArrayBuffer, GpuArrayBufferable}, + render_resource::{ + BufferVec, CachedRenderPipelineId, GpuArrayBuffer, GpuArrayBufferable, UninitBufferVec, + }, renderer::{RenderDevice, RenderQueue}, }; @@ -53,6 +57,57 @@ impl BatchMeta { } } +#[derive(Resource)] +pub enum BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + CpuBuilt(GpuArrayBuffer), + GpuBuilt { + data_buffer: 
UninitBufferVec, + index_buffer: BufferVec, + current_input_buffer: BufferVec, + previous_input_buffer: BufferVec, + /// The number of indices this frame. + /// + /// This is different from `index_buffer.len()` because `index_buffer` + /// gets cleared during `write_batched_instance_buffer`. + index_count: usize, + }, +} + +impl BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + pub fn new(render_device: &RenderDevice, using_gpu_uniform_builder: bool) -> Self { + if !using_gpu_uniform_builder { + return BatchedInstanceBuffers::CpuBuilt(GpuArrayBuffer::new(render_device)); + } + + BatchedInstanceBuffers::GpuBuilt { + data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), + index_buffer: BufferVec::new(BufferUsages::STORAGE), + current_input_buffer: BufferVec::new(BufferUsages::STORAGE), + previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), + index_count: 0, + } + } + + pub fn uniform_binding(&self) -> Option { + match *self { + BatchedInstanceBuffers::CpuBuilt(ref buffer) => buffer.binding(), + BatchedInstanceBuffers::GpuBuilt { + ref data_buffer, .. + } => data_buffer + .buffer() + .map(|buffer| buffer.as_entire_binding()), + } + } +} + /// A trait to support getting data used for batching draw commands via phase /// items. pub trait GetBatchData { @@ -64,6 +119,7 @@ pub trait GetBatchData { /// The per-instance data to be inserted into the [`GpuArrayBuffer`] /// containing these data for all instances. type BufferData: GpuArrayBufferable + Sync + Send + 'static; + type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// If the instance can be batched, also return the data used for /// comparison when deciding whether draws can be batched, else return None @@ -72,25 +128,55 @@ pub trait GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option<(Self::BufferData, Option)>; + /// Same as the above, but for GPU uniform building. 
+ fn get_batch_index( + param: &SystemParamItem, + query_item: Entity, + ) -> Option<(u32, Option)>; } /// Batch the items in a render phase. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. -pub fn batch_and_prepare_render_phase( - gpu_array_buffer: ResMut>, +pub fn batch_and_prepare_render_phase( + gpu_array_buffer: ResMut>, mut views: Query<&mut RenderPhase>, param: StaticSystemParam, -) { +) where + I: CachedRenderPipelinePhaseItem, + F: GetBatchData, +{ let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); let mut process_item = |item: &mut I| { - let (buffer_data, compare_data) = F::get_batch_data(&system_param_item, item.entity())?; - let buffer_index = gpu_array_buffer.push(buffer_data); + let compare_data = match gpu_array_buffer { + BatchedInstanceBuffers::CpuBuilt(ref mut buffer) => { + let (buffer_data, compare_data) = + F::get_batch_data(&system_param_item, item.entity())?; + let buffer_index = buffer.push(buffer_data); + + let index = buffer_index.index; + *item.batch_range_mut() = index..index + 1; + *item.dynamic_offset_mut() = buffer_index.dynamic_offset; - let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; + compare_data + } + + BatchedInstanceBuffers::GpuBuilt { + index_buffer, + data_buffer, + .. 
+ } => { + let (batch_index, compare_data) = + F::get_batch_index(&system_param_item, item.entity())?; + let index_buffer_index = index_buffer.push(batch_index) as u32; + let data_buffer_index = data_buffer.add() as u32; + debug_assert_eq!(index_buffer_index, data_buffer_index); + *item.batch_range_mut() = data_buffer_index..data_buffer_index + 1; + + compare_data + } + }; if I::AUTOMATIC_BATCHING { compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) @@ -118,9 +204,31 @@ pub fn batch_and_prepare_render_phase( render_device: Res, render_queue: Res, - gpu_array_buffer: ResMut>, + gpu_array_buffer: ResMut>, ) { let gpu_array_buffer = gpu_array_buffer.into_inner(); - gpu_array_buffer.write_buffer(&render_device, &render_queue); - gpu_array_buffer.clear(); + match gpu_array_buffer { + BatchedInstanceBuffers::CpuBuilt(ref mut gpu_array_buffer) => { + gpu_array_buffer.write_buffer(&render_device, &render_queue); + gpu_array_buffer.clear(); + } + BatchedInstanceBuffers::GpuBuilt { + ref mut data_buffer, + ref mut index_buffer, + ref mut current_input_buffer, + ref mut index_count, + previous_input_buffer: _, + } => { + data_buffer.write_buffer(&render_device); + index_buffer.write_buffer(&render_device, &render_queue); + *index_count = index_buffer.len(); + current_input_buffer.write_buffer(&render_device, &render_queue); + // There's no need to write `previous_input_buffer`, as we wrote + // that on the previous frame, and it hasn't changed. 
+ + data_buffer.clear(); + index_buffer.clear(); + current_input_buffer.clear(); + } + } } diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl index 17d045154a317..757f84d1e56b1 100644 --- a/crates/bevy_render/src/maths.wgsl +++ b/crates/bevy_render/src/maths.wgsl @@ -27,3 +27,21 @@ fn mat2x4_f32_to_mat3x3_unpack( vec3(a[1].zw, b), ); } + +fn affine3_to_mat3x3(affine: mat4x3) -> mat3x3 { + return mat3x3(affine[0].xyz, affine[1].xyz, affine[2].xyz); +} + +fn inverse_mat3x3(matrix: mat3x3) -> mat3x3 { + let tmp0 = cross(matrix[1], matrix[2]); + let tmp1 = cross(matrix[2], matrix[0]); + let tmp2 = cross(matrix[0], matrix[1]); + let inv_det = 1.0 / dot(matrix[2], tmp2); + return transpose(mat3x3(tmp0 * inv_det, tmp1 * inv_det, tmp2 * inv_det)); +} + +fn inverse_affine3(affine: mat4x3) -> mat4x3 { + let matrix3 = affine3_to_mat3x3(affine); + let inv_matrix3 = inverse_mat3x3(matrix3); + return mat4x3(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(matrix3 * affine[3])); +} diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 26656c58ed983..bb8aa06ada3a0 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -1,3 +1,5 @@ +use std::marker::PhantomData; + use crate::{ render_resource::Buffer, renderer::{RenderDevice, RenderQueue}, @@ -5,6 +7,8 @@ use crate::{ use bytemuck::{cast_slice, Pod}; use wgpu::BufferUsages; +use super::GpuArrayBufferable; + /// A structure for storing raw bytes that have already been properly formatted /// for use by the GPU. 
/// @@ -160,3 +164,77 @@ impl Extend for BufferVec { self.values.extend(iter); } } + +pub struct UninitBufferVec +where + T: GpuArrayBufferable, +{ + buffer: Option, + len: usize, + capacity: usize, + item_size: usize, + buffer_usage: BufferUsages, + label: Option, + label_changed: bool, + phantom: PhantomData, +} + +impl UninitBufferVec +where + T: GpuArrayBufferable, +{ + pub const fn new(buffer_usage: BufferUsages) -> Self { + Self { + len: 0, + buffer: None, + capacity: 0, + item_size: std::mem::size_of::(), + buffer_usage, + label: None, + label_changed: false, + phantom: PhantomData, + } + } + + #[inline] + pub fn buffer(&self) -> Option<&Buffer> { + self.buffer.as_ref() + } + + pub fn add(&mut self) -> usize { + let index = self.len; + self.len += 1; + index + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn clear(&mut self) { + self.len = 0; + } + + pub fn reserve(&mut self, capacity: usize, device: &RenderDevice) { + if capacity <= self.capacity && !self.label_changed { + return; + } + + self.capacity = capacity; + let size = self.item_size * capacity; + self.buffer = Some(device.create_buffer(&wgpu::BufferDescriptor { + label: self.label.as_deref(), + size: size as wgpu::BufferAddress, + usage: BufferUsages::COPY_DST | self.buffer_usage, + mapped_at_creation: false, + })); + + self.label_changed = false; + } + + pub fn write_buffer(&mut self, device: &RenderDevice) { + if !self.is_empty() { + self.reserve(self.len, device); + } + } +} diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 5cd5fcb894374..00fcf4b695ebe 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -11,6 +11,7 @@ use bevy_ecs::{ }; use bevy_math::{Affine3, Vec4}; use bevy_reflect::{std_traits::ReflectDefault, Reflect}; +use bevy_render::batching::BatchedInstanceBuffers; use bevy_render::mesh::MeshVertexBufferLayoutRef; use bevy_render::{ batching::{ @@ -32,6 +33,8 @@ use 
bevy_render::{ Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; use bevy_transform::components::GlobalTransform; +use bevy_utils::tracing::error; +use bytemuck::{Pod, Zeroable}; use crate::Material2dBindGroupId; @@ -116,9 +119,15 @@ impl Plugin for Mesh2dRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Ok(render_app) = app.get_sub_app_mut(RenderApp) { - if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size( - render_app.world.resource::(), - ) { + let render_device = render_app.world.resource::(); + let batched_instance_buffers = BatchedInstanceBuffers::::new( + render_device, + /*using_gpu_uniform_building=*/ false, + ); + + if let Some(per_object_buffer_batch_size) = + GpuArrayBuffer::::batch_size(render_device) + { mesh_bindings_shader_defs.push(ShaderDefVal::UInt( "PER_OBJECT_BUFFER_BATCH_SIZE".into(), per_object_buffer_batch_size, @@ -126,9 +135,7 @@ impl Plugin for Mesh2dRenderPlugin { } render_app - .insert_resource(GpuArrayBuffer::::new( - render_app.world.resource::(), - )) + .insert_resource(batched_instance_buffers) .init_resource::(); } @@ -150,7 +157,8 @@ pub struct Mesh2dTransforms { pub flags: u32, } -#[derive(ShaderType, Clone)] +#[derive(ShaderType, Clone, Pod, Copy, Zeroable)] +#[repr(C)] pub struct Mesh2dUniform { // Affine 4x3 matrix transposed to 3x4 pub transform: [Vec4; 3], @@ -161,6 +169,7 @@ pub struct Mesh2dUniform { pub inverse_transpose_model_a: [Vec4; 2], pub inverse_transpose_model_b: f32, pub flags: u32, + pub pad: [u32; 2], } impl From<&Mesh2dTransforms> for Mesh2dUniform { @@ -172,6 +181,7 @@ impl From<&Mesh2dTransforms> for Mesh2dUniform { inverse_transpose_model_a, inverse_transpose_model_b, flags: mesh_transforms.flags, + pad: [0; 2], } } } @@ -342,6 +352,7 @@ impl GetBatchData for Mesh2dPipeline { type Param = SRes; type CompareData = (Material2dBindGroupId, AssetId); type BufferData = Mesh2dUniform; + type BufferInputData = (); fn get_batch_data( mesh_instances: 
&SystemParamItem, @@ -356,6 +367,14 @@ impl GetBatchData for Mesh2dPipeline { )), )) } + + fn get_batch_index( + _: &SystemParamItem, + _: Entity, + ) -> Option<(u32, Option)> { + error!("Attempted to build 2D mesh uniforms on GPU, which is currently unsupported"); + None + } } bitflags::bitflags! { @@ -571,9 +590,9 @@ pub fn prepare_mesh2d_bind_group( mut commands: Commands, mesh2d_pipeline: Res, render_device: Res, - mesh2d_uniforms: Res>, + mesh2d_uniforms: Res>, ) { - if let Some(binding) = mesh2d_uniforms.binding() { + if let Some(binding) = mesh2d_uniforms.uniform_binding() { commands.insert_resource(Mesh2dBindGroup { value: render_device.create_bind_group( "mesh2d_bind_group", diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs index 47c7b6e4bac0d..2f1eefe0f1098 100644 --- a/examples/shader/shader_instancing.rs +++ b/examples/shader/shader_instancing.rs @@ -127,7 +127,7 @@ fn queue_custom( let view_key = msaa_key | MeshPipelineKey::from_hdr(view.hdr); let rangefinder = view.rangefinder3d(); for entity in &material_meshes { - let Some(mesh_instance) = render_mesh_instances.get(&entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) else { continue; }; let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else { @@ -141,8 +141,7 @@ fn queue_custom( entity, pipeline, draw_function: draw_custom, - distance: rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation), + distance: rangefinder.distance_translation(&mesh_instance.translation), batch_range: 0..1, dynamic_offset: None, }); @@ -245,7 +244,8 @@ impl RenderCommand

for DrawMeshInstanced { (meshes, render_mesh_instances): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { - let Some(mesh_instance) = render_mesh_instances.get(&item.entity()) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(item.entity()) + else { return RenderCommandResult::Failure; }; let Some(gpu_mesh) = meshes.into_inner().get(mesh_instance.mesh_asset_id) else { From 3e5e0954599df6eaa586e6f10624ab96476ca9a9 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 29 Mar 2024 18:38:28 -0700 Subject: [PATCH 02/39] Add more documentation and disable on WebGL --- crates/bevy_pbr/src/lib.rs | 12 +- crates/bevy_pbr/src/prepass/mod.rs | 10 +- .../src/render/build_mesh_uniforms.rs | 34 ++- crates/bevy_pbr/src/render/mesh.rs | 237 +++++++++++++----- crates/bevy_render/src/batching/mod.rs | 69 ++++- crates/bevy_render/src/maths.wgsl | 7 + .../src/render_resource/buffer_vec.rs | 19 ++ crates/bevy_sprite/src/mesh2d/mesh.rs | 5 +- 8 files changed, 317 insertions(+), 76 deletions(-) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index a5ed6f5588637..ad4f1024836ba 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -79,6 +79,7 @@ pub mod graph { /// Label for the screen space ambient occlusion render node. ScreenSpaceAmbientOcclusion, DeferredLightingPass, + /// Label for the compute shader mesh uniforms building pass. BuildMeshUniforms, } } @@ -136,6 +137,8 @@ pub struct PbrPlugin { /// Controls if [`DeferredPbrLightingPlugin`] is added. pub add_default_deferred_lighting_plugin: bool, /// Controls if GPU [`MeshUniform`] building is enabled. + /// + /// This requires compute shader support. 
pub using_gpu_uniform_builder: bool, } @@ -144,7 +147,14 @@ impl Default for PbrPlugin { Self { prepass_enabled: true, add_default_deferred_lighting_plugin: true, - using_gpu_uniform_builder: true, + + // The GPU uniform builder requires compute shaders, which aren't + // available on any version of WebGL. + using_gpu_uniform_builder: cfg!(any( + feature = "webgpu", + not(feature = "webgl"), + not(target_arch = "wasm32"), + )), } } } diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index f0ce0d7c31b58..d12a7e67f95ff 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -156,14 +156,8 @@ where Render, ( prepare_previous_view_projection_uniforms, - batch_and_prepare_render_phase::< - Opaque3dPrepass, - MeshPipeline, - >, - batch_and_prepare_render_phase::< - AlphaMask3dPrepass, - MeshPipeline, - >, + batch_and_prepare_render_phase::, + batch_and_prepare_render_phase::, ) .in_set(RenderSet::PrepareResources), ); diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs index 4db04f516a9fb..34908db9bc0c3 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -1,4 +1,10 @@ -//! Build mesh uniforms. +//! GPU mesh uniform building. +//! +//! This is an optional pass that uses a compute shader to reduce the amount of +//! data that has to be transferred from the CPU to the GPU. When enabled, +//! instead of transferring [`MeshUniform`]s to the GPU, we transfer the smaller +//! [`MeshInputUniform`]s instead and use the GPU to calculate the remaining +//! derived fields in [`MeshUniform`]. 
use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; @@ -21,23 +27,34 @@ use bevy_render::{ renderer::{RenderContext, RenderDevice}, Render, RenderApp, RenderSet, }; -use bevy_utils::tracing::warn; +use bevy_utils::tracing::{error, warn}; use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; +/// The handle to the `build_mesh_uniforms.wgsl` compute shader. pub const BUILD_MESH_UNIFORMS_SHADER_HANDLE: Handle = Handle::weak_from_u128(16991728318640779533); +/// The GPU workgroup size. const WORKGROUP_SIZE: usize = 64; +/// A plugin that builds mesh uniforms on GPU. +/// +/// This will only be added if the platform supports compute shaders (e.g. not +/// on WebGL 2). pub struct BuildMeshUniformsPlugin; +/// The render node for the mesh uniform building pass. #[derive(Default)] pub struct BuildMeshUniformsNode; +/// The compute shader pipeline for the mesh uniform building pass. #[derive(Resource)] pub struct BuildMeshUniformsPipeline { + /// The single bind group layout for the compute shader. pub bind_group_layout: BindGroupLayout, + /// The pipeline ID for the compute shader. + /// /// This gets filled in in `prepare_build_mesh_uniforms_pipeline`. pub pipeline_id: Option, } @@ -66,6 +83,7 @@ impl Plugin for BuildMeshUniformsPlugin { return; }; + // Stitch the node in. render_app .add_render_graph_node::>( Core3d, @@ -94,6 +112,7 @@ impl ViewNode for BuildMeshUniformsNode { _: QueryItem<'w, Self::ViewQuery>, world: &'w World, ) -> Result<(), NodeRunError> { + // Grab the [`BatchedInstanceBuffers`]. 
If we aren't using GPU mesh uniform let BatchedInstanceBuffers::GpuBuilt { data_buffer: ref data_buffer_vec, index_buffer: ref index_buffer_vec, @@ -102,6 +121,10 @@ impl ViewNode for BuildMeshUniformsNode { index_count, } = world.resource::>() else { + error!( + "Attempted to build mesh uniforms on GPU, but `GpuBuilt` batched instance \ + buffers weren't available" + ); return Ok(()); }; @@ -188,9 +211,13 @@ impl FromWorld for BuildMeshUniformsPipeline { let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential( ShaderStages::COMPUTE, ( + // `current_input` storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + // `previous_input` storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + // `indices` storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + // `output` storage_buffer::(/*has_dynamic_offset=*/ false), ), ); @@ -207,6 +234,8 @@ impl FromWorld for BuildMeshUniformsPipeline { } } +/// A system that specializes the `build_mesh_uniforms.wgsl` pipeline if +/// necessary. pub fn prepare_build_mesh_uniforms_pipeline( pipeline_cache: Res, mut pipelines: ResMut>, @@ -221,6 +250,7 @@ pub fn prepare_build_mesh_uniforms_pipeline( build_mesh_uniforms_pipeline.pipeline_id = Some(build_mesh_uniforms_pipeline_id); } +/// Returns `a / b`, rounded toward positive infinity. fn div_round_up(a: usize, b: usize) -> usize { (a + b - 1) / b } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index d009e0f99bfca..809ebbba3f9fb 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -46,7 +46,11 @@ use self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; +/// Provides support for rendering PBR meshes. pub struct MeshRenderPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// If this is false, we're building them on CPU. 
pub using_gpu_uniform_builder: bool, } @@ -229,13 +233,31 @@ pub struct MeshUniform { pub lightmap_uv_rect: UVec2, } +/// Information that has to be transferred from CPU to GPU in order to produce +/// the full [`MeshUniform`]. +/// +/// This is essentially a subset of the fields in [`MeshUniform`] above. #[derive(ShaderType, Pod, Zeroable, Clone, Copy)] #[repr(C)] pub struct MeshInputUniform { - // Affine 4x3 matrix transposed to 3x4 + /// Affine 4x3 matrix transposed to 3x4. pub transform: [Vec4; 3], + /// Four 16-bit unsigned normalized UV values packed into a `UVec2`: + /// + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) pub lightmap_uv_rect: UVec2, + /// Various [`MeshFlags`]. pub flags: u32, + /// The index of this mesh's [`MeshInputUniform`] in the previous frame's + /// buffer, if applicable. + /// + /// This is used for TAA. If not present, this will be `!0`. pub previous_input_index: u32, } @@ -268,43 +290,126 @@ bitflags::bitflags! { } } +impl MeshFlags { + fn from_components( + transform: &GlobalTransform, + not_shadow_receiver: bool, + transmitted_receiver: bool, + ) -> MeshFlags { + let mut mesh_flags = if not_shadow_receiver { + MeshFlags::empty() + } else { + MeshFlags::SHADOW_RECEIVER + }; + if transmitted_receiver { + mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; + } + if transform.affine().matrix3.determinant().is_sign_positive() { + mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + } + + mesh_flags + } +} + bitflags::bitflags! { + /// Various useful flags for [`RenderMeshInstance`]s. #[derive(Clone, Copy)] pub struct RenderMeshInstanceFlags: u8 { + /// The mesh casts shadows. const SHADOW_CASTER = 1 << 0; + /// The mesh can participate in automatic batching. 
const AUTOMATIC_BATCHING = 1 << 1; + /// The mesh had a transform last frame and so is eligible for TAA. const HAVE_PREVIOUS_TRANSFORM = 1 << 2; } } +/// CPU data that the render world keeps for each entity, when *not* using GPU +/// mesh uniform building. #[derive(Deref)] pub struct RenderMeshInstanceCpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. #[deref] pub shared: RenderMeshInstanceShared, + /// The transform of the mesh. + /// + /// This will be written into the [`MeshUniform`] at the appropriate time. pub transforms: MeshTransforms, } +/// CPU data that the render world needs to keep for each entity that contains a +/// mesh when using GPU mesh uniform building. #[derive(Deref)] pub struct RenderMeshInstanceGpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. #[deref] pub shared: RenderMeshInstanceShared, + /// The translation of the mesh. + /// + /// This is the only part of the transform that we have to keep on CPU (for + /// distance sorting). pub translation: Vec3, + /// The index of the [`MeshInputUniform`] in the buffer. pub current_uniform_index: u32, } +/// CPU data that the render world needs to keep about each entity that contains +/// a mesh. pub struct RenderMeshInstanceShared { + /// The [`AssetId`] of the mesh. pub mesh_asset_id: AssetId, + /// A slot for the material bind group ID. + /// + /// This is filled in during [`crate::material::queue_material_meshes`]. pub material_bind_group_id: AtomicMaterialBindGroupId, + /// Various flags. pub flags: RenderMeshInstanceFlags, } +/// Information that is gathered during the parallel portion of mesh extraction +/// when GPU mesh uniform building is enabled. +/// +/// From this, the [`MeshInputUniform`] and [`RenderMeshInstance`] are prepared. pub struct RenderMeshInstanceGpuBuilder { + /// Data that will be placed on the [`RenderMeshInstance`]. 
pub shared: RenderMeshInstanceShared, + /// The current transform. pub transform: Affine3, + /// Various flags. pub mesh_flags: MeshFlags, } impl RenderMeshInstanceShared { + fn from_components( + previous_transform: Option<&PreviousGlobalTransform>, + handle: &Handle, + not_shadow_caster: bool, + no_automatic_batching: bool, + ) -> Self { + let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); + mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); + mesh_instance_flags.set( + RenderMeshInstanceFlags::AUTOMATIC_BATCHING, + !no_automatic_batching, + ); + mesh_instance_flags.set( + RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, + previous_transform.is_some(), + ); + + RenderMeshInstanceShared { + mesh_asset_id: handle.id(), + + flags: mesh_instance_flags, + material_bind_group_id: AtomicMaterialBindGroupId::default(), + } + } + + /// Returns true if this entity is eligible to participate in automatic + /// batching. pub fn should_batch(&self) -> bool { self.flags .contains(RenderMeshInstanceFlags::AUTOMATIC_BATCHING) @@ -312,9 +417,16 @@ impl RenderMeshInstanceShared { } } +/// Information that the render world keeps about each entity that contains a +/// mesh. +/// +/// The set of information needed is different depending on whether CPU or GPU +/// [`MeshUniform`] building is in use. #[derive(Resource)] pub enum RenderMeshInstances { + /// Information needed when using CPU mesh uniform building. CpuBuilding(EntityHashMap), + /// Information needed when using GPU mesh uniform building. GpuBuilding(EntityHashMap), } @@ -327,6 +439,7 @@ impl RenderMeshInstances { } } + /// Returns the ID of the mesh asset attached to the given entity, if any. pub(crate) fn mesh_asset_id(&self, entity: Entity) -> Option> { match *self { RenderMeshInstances::CpuBuilding(ref instances) => instances @@ -338,6 +451,8 @@ impl RenderMeshInstances { } } + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. 
pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { match *self { RenderMeshInstances::CpuBuilding(ref instances) => { @@ -356,13 +471,22 @@ impl RenderMeshInstances { } } +/// Data that [`crate::material::queue_material_meshes`] and similar systems +/// need in order to place entities that contain meshes in the right batch. #[derive(Deref)] pub struct RenderMeshQueueData<'a> { + /// General information about the mesh instance. #[deref] pub shared: &'a RenderMeshInstanceShared, + /// The translation of the mesh instance. pub translation: Vec3, } +/// Extracts meshes from the main world into the render world, populating the +/// [`RenderMeshInstances`]. +/// +/// This is the variant of the system that runs when we're *not* using GPU +/// [`MeshUniform`] building. pub fn extract_meshes_for_cpu_building( mut render_mesh_instances: ResMut, mut render_mesh_instance_queues: Local>>, @@ -395,38 +519,19 @@ pub fn extract_meshes_for_cpu_building( if !view_visibility.get() { return; } - let transform = transform.affine(); - let mut mesh_flags = if not_shadow_receiver { - MeshFlags::empty() - } else { - MeshFlags::SHADOW_RECEIVER - }; - if transmitted_receiver { - mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; - } - if transform.matrix3.determinant().is_sign_positive() { - mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; - } - let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); - mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); - mesh_instance_flags.set( - RenderMeshInstanceFlags::AUTOMATIC_BATCHING, - !no_automatic_batching, - ); - mesh_instance_flags.set( - RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, - previous_transform.is_some(), + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, ); 
render_mesh_instance_queues.scope(|queue| { - let shared = RenderMeshInstanceShared { - mesh_asset_id: handle.id(), - - flags: mesh_instance_flags, - material_bind_group_id: AtomicMaterialBindGroupId::default(), - }; - + let transform = transform.affine(); queue.push(( entity, RenderMeshInstanceCpu { @@ -460,6 +565,11 @@ pub fn extract_meshes_for_cpu_building( } } +/// Extracts meshes from the main world into the render world and queues +/// [`MeshInputUniform`]s to be uploaded to the GPU. +/// +/// This is the variant of the system that runs when we're using GPU +/// [`MeshUniform`] building. pub fn extract_meshes_for_gpu_building( mut render_mesh_instances: ResMut, mut batched_instance_buffers: ResMut>, @@ -494,42 +604,23 @@ pub fn extract_meshes_for_gpu_building( if !view_visibility.get() { return; } - let transform = transform.affine(); - let mut mesh_flags = if not_shadow_receiver { - MeshFlags::empty() - } else { - MeshFlags::SHADOW_RECEIVER - }; - if transmitted_receiver { - mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; - } - if transform.matrix3.determinant().is_sign_positive() { - mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; - } - let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); - mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); - mesh_instance_flags.set( - RenderMeshInstanceFlags::AUTOMATIC_BATCHING, - !no_automatic_batching, - ); - mesh_instance_flags.set( - RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, - previous_transform.is_some(), + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, ); render_mesh_instance_queues.scope(|queue| { - let shared = RenderMeshInstanceShared { - mesh_asset_id: handle.id(), - - flags: mesh_instance_flags, - material_bind_group_id: AtomicMaterialBindGroupId::default(), - 
}; queue.push(( entity, RenderMeshInstanceGpuBuilder { shared, - transform: (&transform).into(), + transform: (&transform.affine()).into(), mesh_flags, }, )); @@ -537,14 +628,31 @@ pub fn extract_meshes_for_gpu_building( }, ); + collect_meshes_for_gpu_building( + &mut render_mesh_instances, + &mut batched_instance_buffers, + &mut render_mesh_instance_queues, + &mut prev_render_mesh_instances, + ); +} + +/// Creates the [`RenderMeshInstance`]s and [`MeshInputUniform`]s when GPU mesh +/// uniforms are built. +fn collect_meshes_for_gpu_building( + render_mesh_instances: &mut RenderMeshInstances, + batched_instance_buffers: &mut BatchedInstanceBuffers, + render_mesh_instance_queues: &mut Parallel>, + prev_render_mesh_instances: &mut EntityHashMap, +) { // Collect render mesh instances. Build up the uniform buffer. let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances else { panic!( - "`extract_meshes_for_gpu_building` should only be called if we're using GPU \ - `MeshUniform` building" + "`collect_render_mesh_instances_for_gpu_building` should only be called if we're \ + using GPU `MeshUniform` building" ); }; + let BatchedInstanceBuffers::GpuBuilt { ref mut current_input_buffer, ref mut previous_input_buffer, @@ -556,8 +664,9 @@ pub fn extract_meshes_for_gpu_building( // Swap buffers. mem::swap(current_input_buffer, previous_input_buffer); - mem::swap(render_mesh_instances, &mut prev_render_mesh_instances); + mem::swap(render_mesh_instances, prev_render_mesh_instances); + // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s. render_mesh_instances.clear(); for queue in render_mesh_instance_queues.iter_mut() { for (entity, builder) in queue.drain(..) { @@ -573,6 +682,7 @@ pub fn extract_meshes_for_gpu_building( None }; + // Push the mesh input uniform. let current_uniform_index = current_input_buffer.push(MeshInputUniform { transform: builder.transform.to_transpose(), // TODO: Track this. 
@@ -581,6 +691,7 @@ pub fn extract_meshes_for_gpu_building( previous_input_index: previous_input_index.unwrap_or(!0), }); + // Record the [`RenderMeshInstance`]. render_mesh_instances.insert( entity, RenderMeshInstanceGpu { @@ -747,9 +858,15 @@ impl GetBatchData for MeshPipeline { (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(u32, Option)> { + // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_batch_index` should never be called in CPU mesh uniform \ + building mode" + ); return None; }; + let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 492019eebea9b..7dea0b02728da 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -57,18 +57,60 @@ impl BatchMeta { } } +/// The GPU buffers holding the data needed to render batches. +/// +/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the +/// `BD` type parameter in that mode. +/// +/// There are two setups here, one for CPU uniform building and one for GPU +/// uniform building. The CPU uniform setup is simple: there's one *buffer data* +/// (`BD`) type per instance. GPU uniform building has a separate *buffer data +/// input* type (`BDI`), which a compute shader is expected to expand to the +/// full buffer data (`BD`) type. GPU uniform building is generally faster and +/// uses less GPU bus bandwidth, but only implemented for some pipelines (for +/// example, not in the 2D pipeline at present) and only when compute shader is +/// available. #[derive(Resource)] pub enum BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, BDI: Pod, { + /// The single buffer containing instances, used when GPU uniform building + /// isn't available. 
CpuBuilt(GpuArrayBuffer), + + /// The buffers containing per-instance data used when GPU uniform building + /// is in use. GpuBuilt { + /// A storage area for the buffer data that the GPU compute shader is + /// expected to write to. + /// + /// There will be one entry for each index. data_buffer: UninitBufferVec, + + /// The index of the buffer data in the current input buffer that + /// corresponds to each instance. + /// + /// It's entirely possible for indices to be duplicated in this list. + /// This typically occurs when an entity is visible from multiple views: + /// e.g. the main camera plus a shadow map. index_buffer: BufferVec, + + /// The uniform data inputs for the current frame. + /// + /// These are uploaded during the extraction phase. current_input_buffer: BufferVec, + + /// The uniform data inputs for the previous frame. + /// + /// The indices don't generally line up between `current_input_buffer` + /// and `previous_input_buffer`, because, among other reasons, entities + /// can spawn or despawn between frames. Instead, each current buffer + /// data input uniform is expected to contain the index of the + /// corresponding buffer data input uniform in this list. previous_input_buffer: BufferVec, + /// The number of indices this frame. /// /// This is different from `index_buffer.len()` because `index_buffer` @@ -82,6 +124,7 @@ where BD: GpuArrayBufferable + Sync + Send + 'static, BDI: Pod, { + /// Creates new buffers. pub fn new(render_device: &RenderDevice, using_gpu_uniform_builder: bool) -> Self { if !using_gpu_uniform_builder { return BatchedInstanceBuffers::CpuBuilt(GpuArrayBuffer::new(render_device)); @@ -96,6 +139,11 @@ where } } + /// Returns the binding of the uniform buffer that contains the per-instance + /// data. + /// + /// If we're in the GPU uniform building mode, this buffer needs to be + /// filled in via a compute shader. 
pub fn uniform_binding(&self) -> Option { match *self { BatchedInstanceBuffers::CpuBuilt(ref buffer) => buffer.binding(), @@ -119,16 +167,31 @@ pub trait GetBatchData { /// The per-instance data to be inserted into the [`GpuArrayBuffer`] /// containing these data for all instances. type BufferData: GpuArrayBufferable + Sync + Send + 'static; + /// The per-instance data that was inserted into the [`BufferVec`] during + /// extraction. + /// + /// This is only used when building uniforms on GPU. If this pipeline + /// doesn't support GPU uniform building (e.g. the 2D mesh pipeline), this + /// can safely be `()`. type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// If the instance can be batched, also return the data used for /// comparison when deciding whether draws can be batched, else return None /// for the `CompareData`. + /// + /// This is only called when building uniforms on CPU. In the GPU uniform + /// building path, we use [`GetBatchData::get_batch_index`] instead. fn get_batch_data( param: &SystemParamItem, query_item: Entity, ) -> Option<(Self::BufferData, Option)>; - /// Same as the above, but for GPU uniform building. + /// Returns the index of the mesh instance in the buffer, if GPU uniform + /// building is in use. + /// + /// This needs only the index, because we already inserted the + /// [`GetBatchData::BufferInputData`] during the extraction phase before we + /// got here. If CPU uniform building is in use, this function will never be + /// called. fn get_batch_index( param: &SystemParamItem, query_item: Entity, @@ -221,7 +284,11 @@ pub fn write_batched_instance_buffer( } => { data_buffer.write_buffer(&render_device); index_buffer.write_buffer(&render_device, &render_queue); + + // Save the index count before we clear it out. Rendering will need + // it. 
*index_count = index_buffer.len(); + current_input_buffer.write_buffer(&render_device, &render_queue); // There's no need to write `previous_input_buffer`, as we wrote // that on the previous frame, and it hasn't changed. diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl index 757f84d1e56b1..04b2f3a504c5f 100644 --- a/crates/bevy_render/src/maths.wgsl +++ b/crates/bevy_render/src/maths.wgsl @@ -28,10 +28,13 @@ fn mat2x4_f32_to_mat3x3_unpack( ); } +// Extracts the square portion of an affine matrix: i.e. discards the +// translation. fn affine3_to_mat3x3(affine: mat4x3) -> mat3x3 { return mat3x3(affine[0].xyz, affine[1].xyz, affine[2].xyz); } +// Returns the inverse of a 3x3 matrix. fn inverse_mat3x3(matrix: mat3x3) -> mat3x3 { let tmp0 = cross(matrix[1], matrix[2]); let tmp1 = cross(matrix[2], matrix[0]); @@ -40,6 +43,10 @@ fn inverse_mat3x3(matrix: mat3x3) -> mat3x3 { return transpose(mat3x3(tmp0 * inv_det, tmp1 * inv_det, tmp2 * inv_det)); } +// Returns the inverse of an affine matrix. +// +// Recall that an affine matrix is just a 4x4 matrix with the last column of [0, +// 0, 0, 1]; thus the inverse is well-defined. fn inverse_affine3(affine: mat4x3) -> mat4x3 { let matrix3 = affine3_to_mat3x3(affine); let inv_matrix3 = inverse_mat3x3(matrix3); diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index bb8aa06ada3a0..fb3dba0072458 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -165,6 +165,14 @@ impl Extend for BufferVec { } } +/// Like a [`BufferVec`], but only reserves space on the GPU for elements +/// instead of initializing them CPU-side. +/// +/// This type is useful when you're accumulating "output slots" for a GPU +/// compute shader to write into. +/// +/// The type `T` need not be [`Pod`], unlike [`BufferVec`]; it only has to be +/// [`GpuArrayBufferable`]. 
pub struct UninitBufferVec where T: GpuArrayBufferable, @@ -183,6 +191,7 @@ impl UninitBufferVec where T: GpuArrayBufferable, { + /// Creates a new [`UninitBufferVec`] with the given [`BufferUsages`]. pub const fn new(buffer_usage: BufferUsages) -> Self { Self { len: 0, @@ -196,25 +205,33 @@ where } } + /// Returns the buffer, if allocated. #[inline] pub fn buffer(&self) -> Option<&Buffer> { self.buffer.as_ref() } + /// Reserves space for one more element in the buffer and returns its index. pub fn add(&mut self) -> usize { let index = self.len; self.len += 1; index } + /// Returns true if no elements have been added to this [`UninitBufferVec`]. pub fn is_empty(&self) -> bool { self.len == 0 } + /// Removes all elements from the buffer. pub fn clear(&mut self) { self.len = 0; } + /// Materializes the buffer on the GPU with space for `capacity` elements. + /// + /// If the buffer is already big enough, this function doesn't reallocate + /// the buffer. pub fn reserve(&mut self, capacity: usize, device: &RenderDevice) { if capacity <= self.capacity && !self.label_changed { return; @@ -232,6 +249,8 @@ where self.label_changed = false; } + /// Materializes the buffer on the GPU, with an appropriate size for the + /// elements that have been pushed so far. 
pub fn write_buffer(&mut self, device: &RenderDevice) { if !self.is_empty() { self.reserve(self.len, device); diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 00fcf4b695ebe..e0de1c190ea9f 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -34,7 +34,6 @@ use bevy_render::{ }; use bevy_transform::components::GlobalTransform; use bevy_utils::tracing::error; -use bytemuck::{Pod, Zeroable}; use crate::Material2dBindGroupId; @@ -157,7 +156,7 @@ pub struct Mesh2dTransforms { pub flags: u32, } -#[derive(ShaderType, Clone, Pod, Copy, Zeroable)] +#[derive(ShaderType, Clone)] #[repr(C)] pub struct Mesh2dUniform { // Affine 4x3 matrix transposed to 3x4 @@ -169,7 +168,6 @@ pub struct Mesh2dUniform { pub inverse_transpose_model_a: [Vec4; 2], pub inverse_transpose_model_b: f32, pub flags: u32, - pub pad: [u32; 2], } impl From<&Mesh2dTransforms> for Mesh2dUniform { @@ -181,7 +179,6 @@ impl From<&Mesh2dTransforms> for Mesh2dUniform { inverse_transpose_model_a, inverse_transpose_model_b, flags: mesh_transforms.flags, - pad: [0; 2], } } } From 9fe6ffb73812a1c92ecda941df63e176b460233c Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sat, 30 Mar 2024 12:36:17 -0700 Subject: [PATCH 03/39] Extract the lightmap UV rect. --- crates/bevy_pbr/src/render/mesh.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 81b4657e783f8..b4203dbacc972 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -390,6 +390,16 @@ pub struct RenderMeshInstanceGpuBuilder { pub shared: RenderMeshInstanceShared, /// The current transform. 
pub transform: Affine3, + /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]: + /// + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) + pub lightmap_uv_rect: UVec2, /// Various flags. pub mesh_flags: MeshFlags, } @@ -593,6 +603,7 @@ pub fn extract_meshes_for_gpu_building( &ViewVisibility, &GlobalTransform, Option<&PreviousGlobalTransform>, + Option<&Lightmap>, &Handle, Has, Has, @@ -607,6 +618,7 @@ pub fn extract_meshes_for_gpu_building( view_visibility, transform, previous_transform, + lightmap, handle, not_shadow_receiver, transmitted_receiver, @@ -627,12 +639,16 @@ pub fn extract_meshes_for_gpu_building( no_automatic_batching, ); + let lightmap_uv_rect = + lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect)); + render_mesh_instance_queues.scope(|queue| { queue.push(( entity, RenderMeshInstanceGpuBuilder { shared, transform: (&transform.affine()).into(), + lightmap_uv_rect, mesh_flags, }, )); @@ -697,8 +713,7 @@ fn collect_meshes_for_gpu_building( // Push the mesh input uniform. let current_uniform_index = current_input_buffer.push(MeshInputUniform { transform: builder.transform.to_transpose(), - // TODO: Track this. 
- lightmap_uv_rect: lightmap::pack_lightmap_uv_rect(None), + lightmap_uv_rect: builder.lightmap_uv_rect, flags: builder.mesh_flags.bits(), previous_input_index: previous_input_index.unwrap_or(!0), }); From bc764fc309ae57ccf9163fe815a9d50458d3e0da Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sat, 30 Mar 2024 13:52:30 -0700 Subject: [PATCH 04/39] Remove needless `warn!` --- crates/bevy_pbr/src/render/build_mesh_uniforms.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs index ac6da641c940f..9fe94a20cd275 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -139,7 +139,7 @@ impl ViewNode for BuildMeshUniformsNode { let Some(view_build_mesh_uniforms_pipeline) = pipeline_cache.get_compute_pipeline(build_mesh_uniforms_pipeline_id) else { - warn!("The view build mesh uniforms pipeline wasn't present in the pipeline cache"); + // This will happen while the pipeline is being compiled and is fine. 
return Ok(()); }; From 02d5a21546d5c985872a75b07e9d46d4d23950d4 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sat, 30 Mar 2024 15:25:03 -0700 Subject: [PATCH 05/39] Prepare the bind group in a separate system --- .../src/render/build_mesh_uniforms.rs | 116 +++++++++++------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs index 9fe94a20cd275..f5c058d369fec 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -20,9 +20,10 @@ use bevy_render::{ render_graph::{NodeRunError, RenderGraphApp, RenderGraphContext, ViewNode, ViewNodeRunner}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, - BindGroupEntries, BindGroupLayout, CachedComputePipelineId, ComputePassDescriptor, - ComputePipelineDescriptor, DynamicBindGroupLayoutEntries, PipelineCache, Shader, - ShaderStages, SpecializedComputePipeline, SpecializedComputePipelines, + BindGroup, BindGroupEntries, BindGroupLayout, CachedComputePipelineId, + ComputePassDescriptor, ComputePipelineDescriptor, DynamicBindGroupLayoutEntries, + PipelineCache, Shader, ShaderStages, SpecializedComputePipeline, + SpecializedComputePipelines, }, renderer::{RenderContext, RenderDevice}, Render, RenderApp, RenderSet, @@ -59,6 +60,10 @@ pub struct BuildMeshUniformsPipeline { pub pipeline_id: Option, } +/// The compute shader bind group for the mesh uniform building pass. 
+#[derive(Resource, Default)] +pub struct BuildMeshUniformsBindGroup(Option); + impl Plugin for BuildMeshUniformsPlugin { fn build(&self, app: &mut App) { load_internal_asset!( @@ -74,7 +79,10 @@ impl Plugin for BuildMeshUniformsPlugin { render_app.add_systems( Render, - prepare_build_mesh_uniforms_pipeline.in_set(RenderSet::Prepare), + ( + prepare_build_mesh_uniforms_pipeline.in_set(RenderSet::Prepare), + prepare_build_mesh_uniforms_bind_group.in_set(RenderSet::PrepareBindGroups), + ), ); } @@ -98,6 +106,7 @@ impl Plugin for BuildMeshUniformsPlugin { ), ) .init_resource::() + .init_resource::() .init_resource::>(); } } @@ -112,22 +121,24 @@ impl ViewNode for BuildMeshUniformsNode { _: QueryItem<'w, Self::ViewQuery>, world: &'w World, ) -> Result<(), NodeRunError> { - // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh uniform - let BatchedInstanceBuffers::GpuBuilt { - data_buffer: ref data_buffer_vec, - index_buffer: ref index_buffer_vec, - current_input_buffer: ref current_input_buffer_vec, - previous_input_buffer: ref previous_input_buffer_vec, - index_count, - } = world.resource::>() + // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh + // uniform building, bail out. + let BatchedInstanceBuffers::GpuBuilt { index_count, .. } = + world.resource::>() else { error!( - "Attempted to build mesh uniforms on GPU, but `GpuBuilt` batched instance \ - buffers weren't available" + "Attempted to build mesh uniforms on GPU, but `GpuBuilt` batched instance buffers \ + weren't available" ); return Ok(()); }; + // Grab the bind group. 
+ let Some(ref bind_group) = world.resource::().0 else { + error!("Attempted to build mesh uniforms on GPU, but the bind group wasn't available"); + return Ok(()); + }; + let pipeline_cache = world.resource::(); let build_mesh_uniforms_pipeline = world.resource::(); @@ -143,35 +154,6 @@ impl ViewNode for BuildMeshUniformsNode { return Ok(()); }; - let Some(current_input_buffer) = current_input_buffer_vec.buffer() else { - warn!("The current input buffer wasn't uploaded"); - return Ok(()); - }; - let Some(previous_input_buffer) = previous_input_buffer_vec.buffer() else { - // This will happen on the first frame and is fine. - return Ok(()); - }; - let Some(index_buffer) = index_buffer_vec.buffer() else { - warn!("The index buffer wasn't uploaded"); - return Ok(()); - }; - let Some(data_buffer) = data_buffer_vec.buffer() else { - warn!("The data buffer wasn't uploaded"); - return Ok(()); - }; - - // TODO: Do this in a separate system and cache it. - let bind_group = render_context.render_device().create_bind_group( - "build_mesh_uniforms_bind_group", - &build_mesh_uniforms_pipeline.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - index_buffer.as_entire_binding(), - data_buffer.as_entire_binding(), - )), - ); - let mut compute_pass = render_context .command_encoder() @@ -181,7 +163,7 @@ impl ViewNode for BuildMeshUniformsNode { }); compute_pass.set_pipeline(view_build_mesh_uniforms_pipeline); - compute_pass.set_bind_group(0, &bind_group, &[]); + compute_pass.set_bind_group(0, bind_group, &[]); let workgroup_count = div_round_up(*index_count, WORKGROUP_SIZE); compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); @@ -250,6 +232,52 @@ pub fn prepare_build_mesh_uniforms_pipeline( build_mesh_uniforms_pipeline.pipeline_id = Some(build_mesh_uniforms_pipeline_id); } +pub fn prepare_build_mesh_uniforms_bind_group( + render_device: Res, + batched_instance_buffers: Res>, + 
pipeline: Res, + mut bind_group: ResMut, +) { + // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh + // uniform building, bail out. + let BatchedInstanceBuffers::GpuBuilt { + data_buffer: ref data_buffer_vec, + index_buffer: ref index_buffer_vec, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + index_count: _, + } = *batched_instance_buffers + else { + return; + }; + + let ( + Some(current_input_buffer), + Some(previous_input_buffer), + Some(index_buffer), + Some(data_buffer), + ) = ( + current_input_buffer_vec.buffer(), + previous_input_buffer_vec.buffer(), + index_buffer_vec.buffer(), + data_buffer_vec.buffer(), + ) + else { + return; + }; + + bind_group.0 = Some(render_device.create_bind_group( + "build_mesh_uniforms_bind_group", + &pipeline.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + index_buffer.as_entire_binding(), + data_buffer.as_entire_binding(), + )), + )); +} + /// Returns `a / b`, rounded toward positive infinity. fn div_round_up(a: usize, b: usize) -> usize { (a + b - 1) / b From c83c47a12cb0a1188081b47dc4f61987dd335b41 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sat, 30 Mar 2024 15:31:17 -0700 Subject: [PATCH 06/39] Add some more documentation --- crates/bevy_pbr/src/render/build_mesh_uniforms.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs index f5c058d369fec..d5e76c12a1563 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -232,6 +232,8 @@ pub fn prepare_build_mesh_uniforms_pipeline( build_mesh_uniforms_pipeline.pipeline_id = Some(build_mesh_uniforms_pipeline_id); } +/// A system that attaches the mesh uniform buffers to the bind group for the +/// compute shader. 
pub fn prepare_build_mesh_uniforms_bind_group( render_device: Res, batched_instance_buffers: Res>, From 4559bd46f7c6974487090684c37d8d0b8a808a0b Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 31 Mar 2024 19:03:04 -0700 Subject: [PATCH 07/39] `using_gpu_uniform_builder` -> `use_gpu_uniform_builder` --- crates/bevy_pbr/src/lib.rs | 8 ++++---- crates/bevy_pbr/src/render/mesh.rs | 14 +++++++------- crates/bevy_render/src/batching/mod.rs | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 70c1e76533065..0bc4c23a2339e 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -138,7 +138,7 @@ pub struct PbrPlugin { /// Controls if GPU [`MeshUniform`] building is enabled. /// /// This requires compute shader support. - pub using_gpu_uniform_builder: bool, + pub use_gpu_uniform_builder: bool, } impl Default for PbrPlugin { @@ -149,7 +149,7 @@ impl Default for PbrPlugin { // The GPU uniform builder requires compute shaders, which aren't // available on any version of WebGL. 
- using_gpu_uniform_builder: cfg!(any( + use_gpu_uniform_builder: cfg!(any( feature = "webgpu", not(feature = "webgl"), not(target_arch = "wasm32"), @@ -295,7 +295,7 @@ impl Plugin for PbrPlugin { .init_resource::() .add_plugins(( MeshRenderPlugin { - using_gpu_uniform_builder: self.using_gpu_uniform_builder, + use_gpu_uniform_builder: self.use_gpu_uniform_builder, }, MaterialPlugin:: { prepass_enabled: self.prepass_enabled, @@ -368,7 +368,7 @@ impl Plugin for PbrPlugin { app.add_plugins(DeferredPbrLightingPlugin); } - if self.using_gpu_uniform_builder { + if self.use_gpu_uniform_builder { app.add_plugins(BuildMeshUniformsPlugin); } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index b4203dbacc972..8931c3f104589 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -52,7 +52,7 @@ pub struct MeshRenderPlugin { /// Whether we're building [`MeshUniform`]s on GPU. /// /// If this is false, we're building them on CPU. 
- pub using_gpu_uniform_builder: bool, + pub use_gpu_uniform_builder: bool, } pub const FORWARD_IO_HANDLE: Handle = Handle::weak_from_u128(2645551199423808407); @@ -79,7 +79,7 @@ pub const MESH_PIPELINE_VIEW_LAYOUT_SAFE_MAX_TEXTURES: usize = 10; impl Default for MeshRenderPlugin { fn default() -> Self { Self { - using_gpu_uniform_builder: true, + use_gpu_uniform_builder: true, } } } @@ -126,7 +126,7 @@ impl Plugin for MeshRenderPlugin { ); if let Ok(render_app) = app.get_sub_app_mut(RenderApp) { - let render_mesh_instances = RenderMeshInstances::new(self.using_gpu_uniform_builder); + let render_mesh_instances = RenderMeshInstances::new(self.use_gpu_uniform_builder); render_app .init_resource::() @@ -169,7 +169,7 @@ impl Plugin for MeshRenderPlugin { ), ); - if self.using_gpu_uniform_builder { + if self.use_gpu_uniform_builder { render_app.add_systems(ExtractSchedule, extract_meshes_for_gpu_building); } else { render_app.add_systems(ExtractSchedule, extract_meshes_for_cpu_building); @@ -185,7 +185,7 @@ impl Plugin for MeshRenderPlugin { let batched_instance_buffers = BatchedInstanceBuffers::::new( render_device, - self.using_gpu_uniform_builder, + self.use_gpu_uniform_builder, ); if let Some(per_object_buffer_batch_size) = @@ -453,8 +453,8 @@ pub enum RenderMeshInstances { } impl RenderMeshInstances { - fn new(using_gpu_uniform_builder: bool) -> RenderMeshInstances { - if using_gpu_uniform_builder { + fn new(use_gpu_uniform_builder: bool) -> RenderMeshInstances { + if use_gpu_uniform_builder { RenderMeshInstances::GpuBuilding(EntityHashMap::default()) } else { RenderMeshInstances::CpuBuilding(EntityHashMap::default()) diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index c7132bf6baaaa..6f1e468f8212d 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -132,8 +132,8 @@ where BDI: Pod, { /// Creates new buffers. 
- pub fn new(render_device: &RenderDevice, using_gpu_uniform_builder: bool) -> Self { - if !using_gpu_uniform_builder { + pub fn new(render_device: &RenderDevice, use_gpu_uniform_builder: bool) -> Self { + if !use_gpu_uniform_builder { return BatchedInstanceBuffers::CpuBuilt(GpuArrayBuffer::new(render_device)); } From 1bd64b750bee028676572b197e67b133394a6688 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 1 Apr 2024 14:47:46 -0700 Subject: [PATCH 08/39] Remove error spam --- crates/bevy_pbr/src/render/build_mesh_uniforms.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs index 42123880f4195..45efc8a0cf096 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/build_mesh_uniforms.rs @@ -135,7 +135,7 @@ impl ViewNode for BuildMeshUniformsNode { // Grab the bind group. let Some(ref bind_group) = world.resource::().0 else { - error!("Attempted to build mesh uniforms on GPU, but the bind group wasn't available"); + // This can happen for the first frame or so and is fine. return Ok(()); }; From a6e8a62e09a4251f84df1c5520c3a18dfeb31bc7 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 1 Apr 2024 14:56:51 -0700 Subject: [PATCH 09/39] Use a system set instead of using `after` --- crates/bevy_pbr/src/lightmap/mod.rs | 11 ++++------- crates/bevy_pbr/src/render/mesh.rs | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/crates/bevy_pbr/src/lightmap/mod.rs b/crates/bevy_pbr/src/lightmap/mod.rs index a624468eb41cd..e1333723be63c 100644 --- a/crates/bevy_pbr/src/lightmap/mod.rs +++ b/crates/bevy_pbr/src/lightmap/mod.rs @@ -46,7 +46,7 @@ use bevy_render::{ }; use bevy_utils::HashSet; -use crate::RenderMeshInstances; +use crate::{ExtractMeshesSet, RenderMeshInstances}; /// The ID of the lightmap shader. 
pub const LIGHTMAP_SHADER_HANDLE: Handle = @@ -130,12 +130,9 @@ impl Plugin for LightmapPlugin { return; }; - render_app.init_resource::().add_systems( - ExtractSchedule, - extract_lightmaps - .after(crate::extract_meshes_for_cpu_building) - .after(crate::extract_meshes_for_gpu_building), - ); + render_app + .init_resource::() + .add_systems(ExtractSchedule, extract_lightmaps.after(ExtractMeshesSet)); } } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 9a55badf4da27..8c696f68c8847 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -170,9 +170,15 @@ impl Plugin for MeshRenderPlugin { ); if self.use_gpu_uniform_builder { - render_app.add_systems(ExtractSchedule, extract_meshes_for_gpu_building); + render_app.add_systems( + ExtractSchedule, + extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), + ); } else { - render_app.add_systems(ExtractSchedule, extract_meshes_for_cpu_building); + render_app.add_systems( + ExtractSchedule, + extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), + ); } } } @@ -504,6 +510,11 @@ pub struct RenderMeshQueueData<'a> { pub translation: Vec3, } +/// A [`SystemSet`] that encompasses both [`extract_meshes_for_cpu_building`] +/// and [`extract_meshes_for_gpu_building`]. +#[derive(SystemSet, Clone, PartialEq, Eq, Debug, Hash)] +pub struct ExtractMeshesSet; + /// Extracts meshes from the main world into the render world, populating the /// [`RenderMeshInstances`]. 
/// From 49cdeee4e66dc5ab8fd7b19930af3622134634d5 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 1 Apr 2024 22:11:33 -0700 Subject: [PATCH 10/39] Make GPU mesh preparation per-view --- crates/bevy_pbr/src/lib.rs | 13 +- crates/bevy_pbr/src/prepass/mod.rs | 21 +- ...ild_mesh_uniforms.rs => gpu_preprocess.rs} | 195 ++++----- crates/bevy_pbr/src/render/mesh.rs | 48 +-- ...esh_uniforms.wgsl => mesh_preprocess.wgsl} | 24 +- crates/bevy_pbr/src/render/mod.rs | 4 +- crates/bevy_render/src/batching/mod.rs | 381 +++++++++++++----- crates/bevy_render/src/render_phase/mod.rs | 91 ++++- crates/bevy_sprite/src/mesh2d/mesh.rs | 2 +- 9 files changed, 509 insertions(+), 270 deletions(-) rename crates/bevy_pbr/src/render/{build_mesh_uniforms.rs => gpu_preprocess.rs} (54%) rename crates/bevy_pbr/src/render/{build_mesh_uniforms.wgsl => mesh_preprocess.wgsl} (76%) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index c47b5316120f9..517711e21e035 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -80,7 +80,7 @@ pub mod graph { ScreenSpaceAmbientOcclusion, DeferredLightingPass, /// Label for the compute shader mesh uniforms building pass. 
- BuildMeshUniforms, + GpuPreprocess, } } @@ -369,7 +369,7 @@ impl Plugin for PbrPlugin { } if self.use_gpu_uniform_builder { - app.add_plugins(BuildMeshUniformsPlugin); + app.add_plugins(GpuMeshPreprocessPlugin); } app.world_mut().resource_mut::>().insert( @@ -404,15 +404,6 @@ impl Plugin for PbrPlugin { let draw_3d_graph = graph.get_sub_graph_mut(Core3d).unwrap(); draw_3d_graph.add_node(NodePbr::ShadowPass, shadow_pass_node); draw_3d_graph.add_node_edge(NodePbr::ShadowPass, Node3d::StartMainPass); - - render_app.ignore_ambiguity( - bevy_render::Render, - bevy_core_pipeline::core_3d::prepare_core_3d_transmission_textures, - bevy_render::batching::batch_and_prepare_sorted_render_phase::< - bevy_core_pipeline::core_3d::Transmissive3d, - MeshPipeline, - >, - ); } fn finish(&self, app: &mut App) { diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index e6ce3a4eaed84..3a57cd30655ec 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -1,6 +1,5 @@ mod prepass_bindings; -use bevy_render::batching::{batch_and_prepare_binned_render_phase, sort_binned_render_phase}; use bevy_render::mesh::MeshVertexBufferLayoutRef; use bevy_render::render_resource::binding_types::uniform_buffer; pub use prepass_bindings::*; @@ -148,6 +147,13 @@ where ); } + if no_prepass_plugin_loaded { + app.add_plugins(( + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + )); + } + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { return; }; @@ -157,18 +163,7 @@ where .add_systems(ExtractSchedule, extract_camera_previous_view_projection) .add_systems( Render, - ( - ( - sort_binned_render_phase::, - sort_binned_render_phase:: - ).in_set(RenderSet::PhaseSort), - ( - prepare_previous_view_projection_uniforms, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - ).in_set(RenderSet::PrepareResources), - ) + 
(prepare_previous_view_projection_uniforms).in_set(RenderSet::PrepareResources), ); } diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs similarity index 54% rename from crates/bevy_pbr/src/render/build_mesh_uniforms.rs rename to crates/bevy_pbr/src/render/gpu_preprocess.rs index 45efc8a0cf096..0e9ae84c600dd 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -1,4 +1,4 @@ -//! GPU mesh uniform building. +//! GPU mesh preprocessing. //! //! This is an optional pass that uses a compute shader to reduce the amount of //! data that has to be transferred from the CPU to the GPU. When enabled, @@ -8,16 +8,18 @@ use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; -use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_core_pipeline::core_3d::graph::Core3d; use bevy_ecs::{ - query::QueryItem, + component::Component, + entity::Entity, + query::QueryState, schedule::IntoSystemConfigs as _, - system::{Res, ResMut, Resource}, + system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, world::{FromWorld, World}, }; use bevy_render::{ - batching::BatchedInstanceBuffers, - render_graph::{NodeRunError, RenderGraphApp, RenderGraphContext, ViewNode, ViewNodeRunner}, + batching::{BatchedInstanceBuffers, PreprocessWorkItem}, + render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, BindGroup, BindGroupEntries, BindGroupLayout, CachedComputePipelineId, @@ -32,8 +34,8 @@ use bevy_utils::tracing::{error, warn}; use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; -/// The handle to the `build_mesh_uniforms.wgsl` compute shader. -pub const BUILD_MESH_UNIFORMS_SHADER_HANDLE: Handle = +/// The handle to the `mesh_preprocess.wgsl` compute shader. 
+pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = Handle::weak_from_u128(16991728318640779533); /// The GPU workgroup size. @@ -43,33 +45,36 @@ const WORKGROUP_SIZE: usize = 64; /// /// This will only be added if the platform supports compute shaders (e.g. not /// on WebGL 2). -pub struct BuildMeshUniformsPlugin; +pub struct GpuMeshPreprocessPlugin; /// The render node for the mesh uniform building pass. -#[derive(Default)] -pub struct BuildMeshUniformsNode; +pub struct GpuPreprocessNode { + view_query: QueryState<(Entity, Read)>, +} /// The compute shader pipeline for the mesh uniform building pass. #[derive(Resource)] -pub struct BuildMeshUniformsPipeline { +pub struct PreprocessPipeline { /// The single bind group layout for the compute shader. pub bind_group_layout: BindGroupLayout, /// The pipeline ID for the compute shader. /// - /// This gets filled in in `prepare_build_mesh_uniforms_pipeline`. + /// This gets filled in in `prepare_preprocess_pipeline`. pub pipeline_id: Option, } /// The compute shader bind group for the mesh uniform building pass. -#[derive(Resource, Default)] -pub struct BuildMeshUniformsBindGroup(Option); +/// +/// This goes on the view. 
+#[derive(Component)] +pub struct PreprocessBindGroup(BindGroup); -impl Plugin for BuildMeshUniformsPlugin { +impl Plugin for GpuMeshPreprocessPlugin { fn build(&self, app: &mut App) { load_internal_asset!( app, - BUILD_MESH_UNIFORMS_SHADER_HANDLE, - "build_mesh_uniforms.wgsl", + MESH_PREPROCESS_SHADER_HANDLE, + "mesh_preprocess.wgsl", Shader::from_wgsl ); @@ -80,8 +85,8 @@ impl Plugin for BuildMeshUniformsPlugin { render_app.add_systems( Render, ( - prepare_build_mesh_uniforms_pipeline.in_set(RenderSet::Prepare), - prepare_build_mesh_uniforms_bind_group.in_set(RenderSet::PrepareBindGroups), + prepare_preprocess_pipeline.in_set(RenderSet::Prepare), + prepare_preprocess_bind_groups.in_set(RenderSet::PrepareBindGroups), ), ); } @@ -93,64 +98,58 @@ impl Plugin for BuildMeshUniformsPlugin { // Stitch the node in. render_app - .add_render_graph_node::>( - Core3d, - NodePbr::BuildMeshUniforms, - ) - .add_render_graph_edges( - Core3d, - ( - Node3d::StartMainPass, - NodePbr::BuildMeshUniforms, - Node3d::MainOpaquePass, - ), - ) - .init_resource::() - .init_resource::() - .init_resource::>(); + .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) + .init_resource::() + .init_resource::>(); } } -impl ViewNode for BuildMeshUniformsNode { - type ViewQuery = (); +impl FromWorld for GpuPreprocessNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl Node for GpuPreprocessNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } fn run<'w>( &self, _: &mut RenderGraphContext, render_context: &mut RenderContext<'w>, - _: QueryItem<'w, Self::ViewQuery>, world: &'w World, ) -> Result<(), NodeRunError> { // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. - let BatchedInstanceBuffers::GpuBuilt { index_count, .. 
} = - world.resource::>() + let BatchedInstanceBuffers::GpuBuilt { + work_item_buffers: ref index_buffers, + .. + } = world.resource::>() else { error!( - "Attempted to build mesh uniforms on GPU, but `GpuBuilt` batched instance buffers \ + "Attempted to preprocess meshes on GPU, but `GpuBuilt` batched instance buffers \ weren't available" ); return Ok(()); }; - // Grab the bind group. - let Some(ref bind_group) = world.resource::().0 else { - // This can happen for the first frame or so and is fine. - return Ok(()); - }; - let pipeline_cache = world.resource::(); - let build_mesh_uniforms_pipeline = world.resource::(); + let preprocess_pipeline = world.resource::(); - let Some(build_mesh_uniforms_pipeline_id) = build_mesh_uniforms_pipeline.pipeline_id else { + let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else { warn!("The build mesh uniforms pipeline wasn't uploaded"); return Ok(()); }; - let Some(view_build_mesh_uniforms_pipeline) = - pipeline_cache.get_compute_pipeline(build_mesh_uniforms_pipeline_id) + let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) else { // This will happen while the pipeline is being compiled and is fine. + println!("No compute pipeline present!"); return Ok(()); }; @@ -158,35 +157,44 @@ impl ViewNode for BuildMeshUniformsNode { render_context .command_encoder() .begin_compute_pass(&ComputePassDescriptor { - label: Some("build mesh uniforms"), + label: Some("mesh preprocessing"), timestamp_writes: None, }); - compute_pass.set_pipeline(view_build_mesh_uniforms_pipeline); - compute_pass.set_bind_group(0, bind_group, &[]); - let workgroup_count = div_round_up(*index_count, WORKGROUP_SIZE); - compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + compute_pass.set_pipeline(preprocess_pipeline); + + for (view, bind_group) in self.view_query.iter_manual(world) { + // Grab the index buffer for this view. 
+ let Some(index_buffer) = index_buffers.get(&view) else { + warn!("The preprocessing index buffer wasn't present"); + return Ok(()); + }; + + compute_pass.set_bind_group(0, &bind_group.0, &[]); + let workgroup_count = div_round_up(index_buffer.buffer.len(), WORKGROUP_SIZE); + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } Ok(()) } } -impl SpecializedComputePipeline for BuildMeshUniformsPipeline { +impl SpecializedComputePipeline for PreprocessPipeline { type Key = (); fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { ComputePipelineDescriptor { - label: Some("build mesh uniforms".into()), + label: Some("mesh preprocessing".into()), layout: vec![self.bind_group_layout.clone()], push_constant_ranges: vec![], - shader: BUILD_MESH_UNIFORMS_SHADER_HANDLE, + shader: MESH_PREPROCESS_SHADER_HANDLE, shader_defs: vec![], entry_point: "main".into(), } } } -impl FromWorld for BuildMeshUniformsPipeline { +impl FromWorld for PreprocessPipeline { fn from_world(world: &mut World) -> Self { let render_device = world.resource::(); @@ -198,7 +206,7 @@ impl FromWorld for BuildMeshUniformsPipeline { // `previous_input` storage_buffer_read_only::(/*has_dynamic_offset=*/ false), // `indices` - storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer_read_only::(/*has_dynamic_offset=*/ false), // `output` storage_buffer::(/*has_dynamic_offset=*/ false), ), @@ -209,75 +217,72 @@ impl FromWorld for BuildMeshUniformsPipeline { &bind_group_layout_entries, ); - BuildMeshUniformsPipeline { + PreprocessPipeline { bind_group_layout, pipeline_id: None, } } } -/// A system that specializes the `build_mesh_uniforms.wgsl` pipeline if -/// necessary. -pub fn prepare_build_mesh_uniforms_pipeline( +/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary. 
+pub fn prepare_preprocess_pipeline( pipeline_cache: Res, - mut pipelines: ResMut>, - mut build_mesh_uniforms_pipeline: ResMut, + mut pipelines: ResMut>, + mut preprocess_pipeline: ResMut, ) { - if build_mesh_uniforms_pipeline.pipeline_id.is_some() { + if preprocess_pipeline.pipeline_id.is_some() { return; } - let build_mesh_uniforms_pipeline_id = - pipelines.specialize(&pipeline_cache, &build_mesh_uniforms_pipeline, ()); - build_mesh_uniforms_pipeline.pipeline_id = Some(build_mesh_uniforms_pipeline_id); + let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ()); + preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id); } /// A system that attaches the mesh uniform buffers to the bind group for the /// compute shader. -pub fn prepare_build_mesh_uniforms_bind_group( +pub fn prepare_preprocess_bind_groups( + mut commands: Commands, render_device: Res, batched_instance_buffers: Res>, - pipeline: Res, - mut bind_group: ResMut, + pipeline: Res, ) { // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. 
let BatchedInstanceBuffers::GpuBuilt { data_buffer: ref data_buffer_vec, - index_buffer: ref index_buffer_vec, + work_item_buffers: ref index_buffers, current_input_buffer: ref current_input_buffer_vec, previous_input_buffer: ref previous_input_buffer_vec, - index_count: _, } = *batched_instance_buffers else { return; }; - let ( - Some(current_input_buffer), - Some(previous_input_buffer), - Some(index_buffer), - Some(data_buffer), - ) = ( + let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( current_input_buffer_vec.buffer(), previous_input_buffer_vec.buffer(), - index_buffer_vec.buffer(), data_buffer_vec.buffer(), - ) - else { + ) else { return; }; - bind_group.0 = Some(render_device.create_bind_group( - "build_mesh_uniforms_bind_group", - &pipeline.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - index_buffer.as_entire_binding(), - data_buffer.as_entire_binding(), - )), - )); + for (view, index_buffer_vec) in index_buffers { + let Some(index_buffer) = index_buffer_vec.buffer.buffer() else { + continue; + }; + commands + .entity(*view) + .insert(PreprocessBindGroup(render_device.create_bind_group( + "preprocess_bind_group", + &pipeline.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + index_buffer.as_entire_binding(), + data_buffer.as_entire_binding(), + )), + ))); + } } /// Returns `a / b`, rounded toward positive infinity. 
diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 8c696f68c8847..eef11a5174caa 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -15,13 +15,15 @@ use bevy_ecs::{ use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ - batch_and_prepare_binned_render_phase, batch_and_prepare_sorted_render_phase, - sort_binned_render_phase, write_batched_instance_buffer, BatchedInstanceBuffers, + clear_batched_instance_buffers, write_batched_instance_buffer, BatchedInstanceBuffers, GetBatchData, GetBinnedBatchData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, - render_phase::{PhaseItem, RenderCommand, RenderCommandResult, TrackedRenderPass}, + render_phase::{ + BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult, + SortedRenderPhasePlugin, TrackedRenderPass, + }, render_resource::*, renderer::{RenderDevice, RenderQueue}, texture::{BevyDefault, DefaultImageSampler, GpuImage, ImageSampler, TextureFormatPixelInfo}, @@ -123,7 +125,16 @@ impl Plugin for MeshRenderPlugin { app.add_systems( PostUpdate, (no_automatic_skin_batching, no_automatic_morph_batching), - ); + ) + .add_plugins(( + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { let render_mesh_instances = RenderMeshInstances::new(self.use_gpu_uniform_builder); @@ -136,30 +147,13 @@ impl Plugin for MeshRenderPlugin { .init_resource::() .insert_resource(render_mesh_instances) .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) + .add_systems( + ExtractSchedule, + clear_batched_instance_buffers::.before(ExtractMeshesSet), + ) .add_systems( Render, ( - ( - sort_binned_render_phase::, - 
sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - ) - .in_set(RenderSet::PhaseSort), - ( - batch_and_prepare_binned_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::< - AlphaMask3dDeferred, - MeshPipeline, - >, - ) - .in_set(RenderSet::PrepareResources), write_batched_instance_buffer:: .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), @@ -896,7 +890,7 @@ impl GetBatchData for MeshPipeline { )) } - fn get_batch_index( + fn get_batch_input_index( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(u32, Option)> { @@ -950,7 +944,7 @@ impl GetBinnedBatchData for MeshPipeline { )) } - fn get_batch_index( + fn get_batch_input_index( (mesh_instances, _): &SystemParamItem, entity: Entity, ) -> Option { diff --git a/crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl similarity index 76% rename from crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl rename to crates/bevy_pbr/src/render/mesh_preprocess.wgsl index a013e057fbf68..133b8d12dab12 100644 --- a/crates/bevy_pbr/src/render/build_mesh_uniforms.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -23,6 +23,11 @@ struct MeshInput { previous_input_index: u32, } +struct PreprocessWorkItem { + input_index: u32, + output_index: u32, +} + // The current frame's `MeshInput`. @group(0) @binding(0) var current_input: array; // The `MeshInput` values from the previous frame. @@ -30,7 +35,7 @@ struct MeshInput { // Indices into the `MeshInput` buffer. // // There may be many indices that map to the same `MeshInput`. 
-@group(0) @binding(2) var indices: array; +@group(0) @binding(2) var work_items: array; // The output array of `Mesh`es. @group(0) @binding(3) var output: array; @@ -38,12 +43,13 @@ struct MeshInput { @workgroup_size(64) fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { let instance_index = global_invocation_id.x; - if (instance_index >= arrayLength(&output)) { + if (instance_index >= arrayLength(&work_items)) { return; } // Unpack. - let mesh_index = indices[instance_index]; + let mesh_index = work_items[instance_index].input_index; + let output_index = work_items[instance_index].output_index; let model_affine_transpose = current_input[mesh_index].model; let model = maths::affine3_to_square(model_affine_transpose); @@ -67,10 +73,10 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { } // Write the output. - output[instance_index].model = model_affine_transpose; - output[instance_index].previous_model = previous_model; - output[instance_index].inverse_transpose_model_a = inverse_transpose_model_a; - output[instance_index].inverse_transpose_model_b = inverse_transpose_model_b; - output[instance_index].flags = current_input[mesh_index].flags; - output[instance_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect; + output[output_index].model = model_affine_transpose; + output[output_index].previous_model = previous_model; + output[output_index].inverse_transpose_model_a = inverse_transpose_model_a; + output[output_index].inverse_transpose_model_b = inverse_transpose_model_b; + output[output_index].flags = current_input[mesh_index].flags; + output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect; } diff --git a/crates/bevy_pbr/src/render/mod.rs b/crates/bevy_pbr/src/render/mod.rs index ea5b4158bd946..53bc9bcde14b4 100644 --- a/crates/bevy_pbr/src/render/mod.rs +++ b/crates/bevy_pbr/src/render/mod.rs @@ -1,5 +1,5 @@ -mod build_mesh_uniforms; mod fog; +mod gpu_preprocess; mod light; 
pub(crate) mod mesh; mod mesh_bindings; @@ -7,8 +7,8 @@ mod mesh_view_bindings; mod morph; mod skin; -pub use build_mesh_uniforms::*; pub use fog::*; +pub use gpu_preprocess::*; pub use light::*; pub use mesh::*; pub use mesh_bindings::MeshLayouts; diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 6f1e468f8212d..1903eef708ade 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -1,4 +1,4 @@ -use std::marker::PhantomData; +use std::{marker::PhantomData, mem}; use bevy_ecs::{ component::Component, @@ -6,7 +6,9 @@ use bevy_ecs::{ prelude::Res, system::{Query, ResMut, Resource, StaticSystemParam, SystemParam, SystemParamItem}, }; -use bytemuck::Pod; +use bevy_utils::EntityHashMap; +use bytemuck::{Pod, Zeroable}; +use encase::ShaderType; use nonmax::NonMaxU32; use smallvec::{smallvec, SmallVec}; use wgpu::{BindingResource, BufferUsages}; @@ -99,10 +101,8 @@ where /// The index of the buffer data in the current input buffer that /// corresponds to each instance. /// - /// It's entirely possible for indices to be duplicated in this list. - /// This typically occurs when an entity is visible from multiple views: - /// e.g. the main camera plus a shadow map. - index_buffer: BufferVec, + /// This is keyed off each view. Each view has a separate buffer. + work_item_buffers: EntityHashMap, /// The uniform data inputs for the current frame. /// @@ -117,15 +117,26 @@ where /// data input uniform is expected to contain the index of the /// corresponding buffer data input uniform in this list. previous_input_buffer: BufferVec, - - /// The number of indices this frame. - /// - /// This is different from `index_buffer.len()` because `index_buffer` - /// gets cleared during `write_batched_instance_buffer`. - index_count: usize, }, } +/// The buffer of GPU preprocessing work items for a single view. +pub struct PreprocessWorkItemBuffer { + /// The buffer of work items. 
+ pub buffer: BufferVec, + /// True if we've populated the buffer this frame. + /// + /// We use this so that we can delete unused buffers when views disappear. + pub rendered: bool, +} + +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct PreprocessWorkItem { + pub input_index: u32, + pub output_index: u32, +} + impl BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, @@ -139,10 +150,9 @@ where BatchedInstanceBuffers::GpuBuilt { data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), - index_buffer: BufferVec::new(BufferUsages::STORAGE), + work_item_buffers: EntityHashMap::default(), current_input_buffer: BufferVec::new(BufferUsages::STORAGE), previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), - index_count: 0, } } @@ -189,19 +199,20 @@ pub trait GetBatchData { /// for the `CompareData`. /// /// This is only called when building uniforms on CPU. In the GPU uniform - /// building path, we use [`GetBatchData::get_batch_index`] instead. + /// building path, we use [`GetBatchData::get_batch_preprocess_work_item`] + /// instead. fn get_batch_data( param: &SystemParamItem, query_item: Entity, ) -> Option<(Self::BufferData, Option)>; - /// Returns the index of the mesh instance in the buffer, if GPU uniform - /// building is in use. + /// Returns the index of the [`GetBinnedBatchData::BufferInputData`] that + /// the GPU preprocessing phase will use. /// - /// This needs only the index, because we already inserted the - /// [`GetBatchData::BufferInputData`] during the extraction phase before we - /// got here. If CPU uniform building is in use, this function will never be - /// called. - fn get_batch_index( + /// We already inserted the [`GetBatchData::BufferInputData`] during the + /// extraction phase before we got here, so this function shouldn't need to + /// look up any render data. If CPU uniform building is in use, this + /// function will never be called. 
+ fn get_batch_input_index( param: &SystemParamItem, query_item: Entity, ) -> Option<(u32, Option)>; @@ -227,22 +238,61 @@ pub trait GetBinnedBatchData { type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. + /// + /// This is only called when building uniforms on CPU. In the GPU uniform + /// building path, we use [`GetBatchData::get_batch_preprocess_work_item`] + /// instead. fn get_batch_data( param: &SystemParamItem, entity: Entity, ) -> Option; - /// Returns the index of the mesh instance in the buffer, if GPU uniform - /// building is in use. + /// Returns the index of the [`GetBinnedBatchData::BufferInputData`] that + /// the GPU preprocessing phase will use. /// - /// This needs only the index, because we already inserted the - /// [`GetBatchData::BufferInputData`] during the extraction phase before we - /// got here. If CPU uniform building is in use, this function will never be - /// called. - fn get_batch_index(param: &SystemParamItem, query_item: Entity) -> Option; + /// We already inserted the [`GetBinnedBatchData::BufferInputData`] during + /// the extraction phase before we got here, so this function shouldn't need + /// to look up any render data. If CPU uniform building is in use, this + /// function will never be called. + fn get_batch_input_index( + param: &SystemParamItem, + query_item: Entity, + ) -> Option; } -/// Batch the items in a sorted render phase. This means comparing metadata -/// needed to draw each phase item and trying to combine the draws into a batch. +/// A system that runs early in extraction and clears out all the +/// [`BatchedInstanceBuffers`] for the frame. 
+pub fn clear_batched_instance_buffers( + mut gpu_array_buffer: ResMut>, +) where + F: GetBatchData, +{ + match *gpu_array_buffer { + BatchedInstanceBuffers::CpuBuilt(ref mut buffer) => { + buffer.clear(); + } + BatchedInstanceBuffers::GpuBuilt { + ref mut data_buffer, + ref mut work_item_buffers, + ref mut current_input_buffer, + ref mut previous_input_buffer, + } => { + data_buffer.clear(); + current_input_buffer.clear(); + previous_input_buffer.clear(); + + // Clear out all work item buffers. And, if the buffer wasn't + // touched at all last frame, delete it. + work_item_buffers.retain(|_, work_item_buffers| { + work_item_buffers.buffer.clear(); + mem::take(&mut work_item_buffers.rendered) + }); + } + } +} + +/// Batch the items in a sorted render phase, when GPU uniform building isn't in +/// use. This means comparing metadata needed to draw each phase item and trying +/// to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase( gpu_array_buffer: ResMut>, mut views: Query<&mut SortedRenderPhase>, @@ -254,35 +304,13 @@ pub fn batch_and_prepare_sorted_render_phase( let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); - let mut process_item = |item: &mut I| { - let compare_data = match gpu_array_buffer { - BatchedInstanceBuffers::CpuBuilt(ref mut buffer) => { - let (buffer_data, compare_data) = - F::get_batch_data(&system_param_item, item.entity())?; - let buffer_index = buffer.push(buffer_data); + let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { + let (buffer_data, compare_data) = F::get_batch_data(&system_param_item, item.entity())?; + let buffer_index = buffer.push(buffer_data); - let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; - - compare_data - } - - BatchedInstanceBuffers::GpuBuilt { - index_buffer, - data_buffer, - .. 
- } => { - let (batch_index, compare_data) = - F::get_batch_index(&system_param_item, item.entity())?; - let index_buffer_index = index_buffer.push(batch_index) as u32; - let data_buffer_index = data_buffer.add() as u32; - debug_assert_eq!(index_buffer_index, data_buffer_index); - *item.batch_range_mut() = data_buffer_index..data_buffer_index + 1; - - compare_data - } - }; + let index = buffer_index.index; + *item.batch_range_mut() = index..index + 1; + *item.dynamic_offset_mut() = buffer_index.dynamic_offset; if I::AUTOMATIC_BATCHING { compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) @@ -292,8 +320,86 @@ pub fn batch_and_prepare_sorted_render_phase( }; for mut phase in &mut views { + // We only process CPU-built batch data in this function. + let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + continue; + }; + let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item); + let batch_data = process_item(item, buffer); + (item.batch_range_mut(), batch_data) + }); + items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { + if batch_meta.is_some() && prev_batch_meta == batch_meta { + start_range.end = range.end; + (start_range, prev_batch_meta) + } else { + (range, batch_meta) + } + }); + } +} + +/// Batch the items in a sorted render phase, when GPU uniform building isn't in +/// use. This means comparing metadata needed to draw each phase item and trying +/// to combine the draws into a batch. 
+pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( + gpu_array_buffer: ResMut>, + mut views: Query<(Entity, &mut SortedRenderPhase)>, + param: StaticSystemParam, +) where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + F: GetBatchData, +{ + let gpu_array_buffer = gpu_array_buffer.into_inner(); + let system_param_item = param.into_inner(); + + let process_item = + |item: &mut I, + data_buffer: &mut UninitBufferVec, + work_item_buffer: &mut BufferVec| { + let (input_index, compare_data) = + F::get_batch_input_index(&system_param_item, item.entity())?; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + *item.batch_range_mut() = output_index..output_index + 1; + + if I::AUTOMATIC_BATCHING { + compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) + } else { + None + } + }; + + for (view, mut phase) in &mut views { + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers::GpuBuilt { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_array_buffer + else { + continue; + }; + + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + rendered: true, + }); + work_item_buffer.rendered = true; + + let items = phase.items.iter_mut().map(|item| { + let batch_data = process_item(item, data_buffer, &mut work_item_buffer.buffer); (item.batch_range_mut(), batch_data) }); items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { @@ -318,7 +424,8 @@ where } } -/// Creates batches for a render phase that uses bins. +/// Creates batches for a render phase that uses bins, when GPU batch data +/// building isn't in use. 
pub fn batch_and_prepare_binned_render_phase( gpu_array_buffer: ResMut>, mut views: Query<&mut BinnedRenderPhase>, @@ -333,18 +440,20 @@ pub fn batch_and_prepare_binned_render_phase( for mut phase in &mut views { let phase = &mut *phase; // Borrow checker. + // We only process CPU-built batch data in this function. + let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + continue; + }; + // Prepare batchables. for key in &phase.batchable_keys { let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![]; for &entity in &phase.batchable_values[key] { - let Some(instance) = add_batch_data_for_binned_render_phase::( - entity, - &system_param_item, - gpu_array_buffer, - ) else { + let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else { continue; }; + let instance = buffer.push(buffer_data); // If the dynamic offset has changed, flush the batch. // @@ -374,48 +483,106 @@ pub fn batch_and_prepare_binned_render_phase( for key in &phase.unbatchable_keys { let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); for &entity in &unbatchables.entities { - if let Some(instance) = add_batch_data_for_binned_render_phase::( - entity, - &system_param_item, - gpu_array_buffer, - ) { - unbatchables.buffer_indices.add(instance); - } + let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else { + continue; + }; + let instance = buffer.push(buffer_data); + unbatchables.buffer_indices.add(instance); } } } } -/// Adds the batch data necessary to render one instance of an entity that's in -/// a binned render phase. -fn add_batch_data_for_binned_render_phase( - entity: Entity, - system_param_item: &::Item<'_, '_>, - gpu_array_buffer: &mut BatchedInstanceBuffers, -) -> Option> -where +/// Creates batches for a render phase that uses bins. 
+pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( + gpu_array_buffer: ResMut>, + mut views: Query<(Entity, &mut BinnedRenderPhase)>, + param: StaticSystemParam, +) where + BPI: BinnedPhaseItem, GBBD: GetBinnedBatchData, { - match *gpu_array_buffer { - BatchedInstanceBuffers::CpuBuilt(ref mut buffer) => { - let buffer_data = GBBD::get_batch_data(system_param_item, entity)?; - Some(buffer.push(buffer_data)) - } + let gpu_array_buffer = gpu_array_buffer.into_inner(); + let system_param_item = param.into_inner(); - BatchedInstanceBuffers::GpuBuilt { - ref mut index_buffer, + for (view, mut phase) in &mut views { + let phase = &mut *phase; // Borrow checker. + + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers::GpuBuilt { ref mut data_buffer, + ref mut work_item_buffers, .. - } => { - let batch_index = GBBD::get_batch_index(system_param_item, entity)?; - let index_buffer_index = index_buffer.push(batch_index) as u32; - let data_buffer_index = data_buffer.add() as u32; - debug_assert_eq!(index_buffer_index, data_buffer_index); - Some(GpuArrayBufferIndex { - index: index_buffer_index, - dynamic_offset: None, - element_type: PhantomData, - }) + } = gpu_array_buffer + else { + continue; + }; + + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = + work_item_buffers + .entry(view) + .or_insert_with(|| PreprocessWorkItemBuffer { + buffer: BufferVec::new(BufferUsages::STORAGE), + rendered: true, + }); + work_item_buffer.rendered = true; + + // Prepare batchables. 
+ + for key in &phase.batchable_keys { + let mut batch: Option = None; + for &entity in &phase.batchable_values[key] { + let Some(input_index) = GBBD::get_batch_input_index(&system_param_item, entity) + else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + batch + .get_or_insert(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index, + dynamic_offset: None, + }) + .instance_range + .end = output_index + 1; + } + + if let Some(batch) = batch { + phase.batch_sets.push(smallvec![batch]); + } + } + + // Prepare unbatchables. + for key in &phase.unbatchable_keys { + let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); + for &entity in &unbatchables.entities { + let Some(input_index) = GBBD::get_batch_input_index(&system_param_item, entity) + else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + unbatchables + .buffer_indices + .add(GpuArrayBufferIndex:: { + index: output_index, + dynamic_offset: None, + element_type: PhantomData, + }); + } } } } @@ -429,29 +596,23 @@ pub fn write_batched_instance_buffer( match gpu_array_buffer { BatchedInstanceBuffers::CpuBuilt(ref mut gpu_array_buffer) => { gpu_array_buffer.write_buffer(&render_device, &render_queue); - gpu_array_buffer.clear(); } BatchedInstanceBuffers::GpuBuilt { ref mut data_buffer, - ref mut index_buffer, + work_item_buffers: ref mut index_buffers, ref mut current_input_buffer, - ref mut index_count, previous_input_buffer: _, } => { data_buffer.write_buffer(&render_device); - index_buffer.write_buffer(&render_device, &render_queue); - - // Save the index count before we clear it out. Rendering will need - // it. 
- *index_count = index_buffer.len(); - current_input_buffer.write_buffer(&render_device, &render_queue); // There's no need to write `previous_input_buffer`, as we wrote // that on the previous frame, and it hasn't changed. - data_buffer.clear(); - index_buffer.clear(); - current_input_buffer.clear(); + for index_buffer in index_buffers.values_mut() { + index_buffer + .buffer + .write_buffer(&render_device, &render_queue); + } } } } diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 40c4153f3fde2..76d7bc579f189 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -29,6 +29,7 @@ mod draw; mod draw_state; mod rangefinder; +use bevy_app::{App, Plugin}; use bevy_utils::{default, hashbrown::hash_map::Entry, HashMap}; pub use draw::*; pub use draw_state::*; @@ -36,13 +37,17 @@ use encase::{internal::WriteInto, ShaderSize}; use nonmax::NonMaxU32; pub use rangefinder::*; -use crate::render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}; +use crate::{ + batching::{self, GetBatchData, GetBinnedBatchData}, + render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}, + Render, RenderApp, RenderSet, +}; use bevy_ecs::{ prelude::*, system::{lifetimeless::SRes, SystemParamItem}, }; use smallvec::SmallVec; -use std::{hash::Hash, ops::Range, slice::SliceIndex}; +use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex}; /// A collection of all rendering instructions, that will be executed by the GPU, for a /// single render phase for a single view. @@ -291,6 +296,88 @@ where } } +/// A convenient abstraction for adding all the systems necessary for a binned +/// render phase to the render app. 
+pub struct BinnedRenderPhasePlugin(PhantomData<(BPI, GBBD)>) +where + BPI: BinnedPhaseItem, + GBBD: GetBinnedBatchData; + +impl Default for BinnedRenderPhasePlugin +where + BPI: BinnedPhaseItem, + GBBD: GetBinnedBatchData, +{ + fn default() -> Self { + Self(PhantomData) + } +} + +impl Plugin for BinnedRenderPhasePlugin +where + BPI: BinnedPhaseItem, + GBBD: GetBinnedBatchData + Sync + Send + 'static, +{ + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems( + Render, + ( + batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), + ( + batching::batch_and_prepare_binned_render_phase::, + batching::batch_and_prepare_binned_render_phase_for_gpu_preprocessing::< + BPI, + GBBD, + >, + ) + .in_set(RenderSet::PrepareResources), + ), + ); + } +} + +/// A convenient abstraction for adding all the systems necessary for a sorted +/// render phase to the render app. +pub struct SortedRenderPhasePlugin(PhantomData<(SPI, GBD)>) +where + SPI: SortedPhaseItem, + GBD: GetBatchData; + +impl Default for SortedRenderPhasePlugin +where + SPI: SortedPhaseItem, + GBD: GetBatchData, +{ + fn default() -> Self { + Self(PhantomData) + } +} + +impl Plugin for SortedRenderPhasePlugin +where + SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem, + GBD: GetBatchData + Sync + Send + 'static, +{ + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app.add_systems( + Render, + ( + batching::batch_and_prepare_sorted_render_phase::, + batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, + ) + .in_set(RenderSet::PrepareResources), + ); + } +} + impl UnbatchableBinnedEntityBufferIndex { /// Adds a new entity to the list of unbatchable binned entities. 
pub fn add(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex) diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 590ce18dd5d7d..287db60ca1261 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -365,7 +365,7 @@ impl GetBatchData for Mesh2dPipeline { )) } - fn get_batch_index( + fn get_batch_input_index( _: &SystemParamItem, _: Entity, ) -> Option<(u32, Option)> { From 510a93a8cb7038100968b942edb82207959dadf1 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 2 Apr 2024 01:07:58 -0700 Subject: [PATCH 11/39] Split out deletion of buffers to try to fix crashes --- crates/bevy_pbr/src/render/mesh.rs | 10 ++++++--- crates/bevy_render/src/batching/mod.rs | 29 +++++++++++++++++++------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index eef11a5174caa..769ffa79d81e6 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -15,8 +15,9 @@ use bevy_ecs::{ use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ - clear_batched_instance_buffers, write_batched_instance_buffer, BatchedInstanceBuffers, - GetBatchData, GetBinnedBatchData, NoAutomaticBatching, + clear_batched_instance_buffers, delete_old_work_item_buffers, + write_batched_instance_buffer, BatchedInstanceBuffers, GetBatchData, GetBinnedBatchData, + NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, @@ -27,7 +28,7 @@ use bevy_render::{ render_resource::*, renderer::{RenderDevice, RenderQueue}, texture::{BevyDefault, DefaultImageSampler, GpuImage, ImageSampler, TextureFormatPixelInfo}, - view::{ViewTarget, ViewUniformOffset, ViewVisibility}, + view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility}, Extract, }; use bevy_transform::components::GlobalTransform; @@ -154,6 +155,9 @@ impl Plugin for MeshRenderPlugin { .add_systems( Render, ( + 
delete_old_work_item_buffers:: + .in_set(RenderSet::ManageViews) + .after(prepare_view_targets), write_batched_instance_buffer:: .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 1903eef708ade..853499a15aa29 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -1,9 +1,10 @@ -use std::{marker::PhantomData, mem}; +use std::marker::PhantomData; use bevy_ecs::{ component::Component, entity::Entity, prelude::Res, + query::With, system::{Query, ResMut, Resource, StaticSystemParam, SystemParam, SystemParamItem}, }; use bevy_utils::EntityHashMap; @@ -23,6 +24,7 @@ use crate::{ UninitBufferVec, }, renderer::{RenderDevice, RenderQueue}, + view::ViewTarget, }; /// Add this component to mesh entities to disable automatic batching @@ -279,17 +281,28 @@ pub fn clear_batched_instance_buffers( data_buffer.clear(); current_input_buffer.clear(); previous_input_buffer.clear(); - - // Clear out all work item buffers. And, if the buffer wasn't - // touched at all last frame, delete it. - work_item_buffers.retain(|_, work_item_buffers| { - work_item_buffers.buffer.clear(); - mem::take(&mut work_item_buffers.rendered) - }); + for work_item_buffer in work_item_buffers.values_mut() { + work_item_buffer.buffer.clear(); + } } } } +pub fn delete_old_work_item_buffers( + mut gpu_array_buffer: ResMut>, + view_targets: Query>, +) where + F: GetBatchData, +{ + if let BatchedInstanceBuffers::GpuBuilt { + ref mut work_item_buffers, + .. + } = *gpu_array_buffer + { + work_item_buffers.retain(|entity, _| view_targets.contains(*entity)); + } +} + /// Batch the items in a sorted render phase, when GPU uniform building isn't in /// use. This means comparing metadata needed to draw each phase item and trying /// to combine the draws into a batch. 
From 5bbc05b24ddff59e5696a7f3a49c68c41ffe450d Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 2 Apr 2024 12:33:49 -0700 Subject: [PATCH 12/39] Add some documentation --- crates/bevy_pbr/src/render/mesh_preprocess.wgsl | 4 ++++ crates/bevy_render/src/batching/mod.rs | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index 133b8d12dab12..b0c76e7086c15 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -23,8 +23,12 @@ struct MeshInput { previous_input_index: u32, } +// One invocation of this compute shader: i.e. one mesh instance in a view. struct PreprocessWorkItem { + // The index of the `MeshInput` in the `current_input` buffer that we read + // from. input_index: u32, + // The index of the `Mesh` in `output` that we write to. output_index: u32, } diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 853499a15aa29..7deb58a60c178 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -132,10 +132,15 @@ pub struct PreprocessWorkItemBuffer { pub rendered: bool, } +/// One invocation of the preprocessing shader: i.e. one mesh instance in a +/// view. #[derive(Clone, Copy, Pod, Zeroable, ShaderType)] #[repr(C)] pub struct PreprocessWorkItem { + /// The index of the batch input data in the input buffer that the shader + /// reads from. pub input_index: u32, + /// The index of the `MeshUniform` in the output buffer that we write to. pub output_index: u32, } @@ -263,6 +268,10 @@ pub trait GetBinnedBatchData { /// A system that runs early in extraction and clears out all the /// [`BatchedInstanceBuffers`] for the frame. 
+/// +/// We have to run this during extraction because, if GPU preprocessing is in +/// use, the extraction phase will write to the mesh input uniform buffers +/// directly, so the buffers need to be cleared before then. pub fn clear_batched_instance_buffers( mut gpu_array_buffer: ResMut>, ) where @@ -288,6 +297,12 @@ pub fn clear_batched_instance_buffers( } } +/// A system that removes GPU preprocessing work item buffers that correspond to +/// deleted [`ViewTarget`]s. +/// +/// This is a separate system from [`clear_batched_instance_buffers`] because +/// [`ViewTarget`]s aren't created until after the extraction phase is +/// completed. pub fn delete_old_work_item_buffers( mut gpu_array_buffer: ResMut>, view_targets: Query>, From 95f1de49756e2c297170c73bb0b8d74544e90b19 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 2 Apr 2024 12:34:08 -0700 Subject: [PATCH 13/39] Rustfmt --- crates/bevy_pbr/src/lib.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 517711e21e035..1f5b11e2e8cb2 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -372,14 +372,16 @@ impl Plugin for PbrPlugin { app.add_plugins(GpuMeshPreprocessPlugin); } - app.world_mut().resource_mut::>().insert( - &Handle::::default(), - StandardMaterial { - base_color: Color::srgb(1.0, 0.0, 0.5), - unlit: true, - ..Default::default() - }, - ); + app.world_mut() + .resource_mut::>() + .insert( + &Handle::::default(), + StandardMaterial { + base_color: Color::srgb(1.0, 0.0, 0.5), + unlit: true, + ..Default::default() + }, + ); let Some(render_app) = app.get_sub_app_mut(RenderApp) else { return; From 1fc3d81a2aa29f6db1c40644d1ff7963d1bb515a Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Wed, 3 Apr 2024 01:26:30 -0700 Subject: [PATCH 14/39] Remove some obsolete code --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 5 +-- crates/bevy_render/src/batching/mod.rs | 46 
+++++--------------- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 0e9ae84c600dd..628c13075d32c 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -149,7 +149,6 @@ impl Node for GpuPreprocessNode { let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) else { // This will happen while the pipeline is being compiled and is fine. - println!("No compute pipeline present!"); return Ok(()); }; @@ -171,7 +170,7 @@ impl Node for GpuPreprocessNode { }; compute_pass.set_bind_group(0, &bind_group.0, &[]); - let workgroup_count = div_round_up(index_buffer.buffer.len(), WORKGROUP_SIZE); + let workgroup_count = div_round_up(index_buffer.len(), WORKGROUP_SIZE); compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); } @@ -267,7 +266,7 @@ pub fn prepare_preprocess_bind_groups( }; for (view, index_buffer_vec) in index_buffers { - let Some(index_buffer) = index_buffer_vec.buffer.buffer() else { + let Some(index_buffer) = index_buffer_vec.buffer() else { continue; }; commands diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 7deb58a60c178..8ecdc6ee0f87f 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -104,7 +104,7 @@ where /// corresponds to each instance. /// /// This is keyed off each view. Each view has a separate buffer. - work_item_buffers: EntityHashMap, + work_item_buffers: EntityHashMap>, /// The uniform data inputs for the current frame. /// @@ -122,16 +122,6 @@ where }, } -/// The buffer of GPU preprocessing work items for a single view. -pub struct PreprocessWorkItemBuffer { - /// The buffer of work items. - pub buffer: BufferVec, - /// True if we've populated the buffer this frame. 
- /// - /// We use this so that we can delete unused buffers when views disappear. - pub rendered: bool, -} - /// One invocation of the preprocessing shader: i.e. one mesh instance in a /// view. #[derive(Clone, Copy, Pod, Zeroable, ShaderType)] @@ -291,7 +281,7 @@ pub fn clear_batched_instance_buffers( current_input_buffer.clear(); previous_input_buffer.clear(); for work_item_buffer in work_item_buffers.values_mut() { - work_item_buffer.buffer.clear(); + work_item_buffer.clear(); } } } @@ -417,17 +407,12 @@ pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = - work_item_buffers - .entry(view) - .or_insert_with(|| PreprocessWorkItemBuffer { - buffer: BufferVec::new(BufferUsages::STORAGE), - rendered: true, - }); - work_item_buffer.rendered = true; + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, data_buffer, &mut work_item_buffer.buffer); + let batch_data = process_item(item, data_buffer, work_item_buffer); (item.batch_range_mut(), batch_data) }); items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { @@ -548,14 +533,9 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = - work_item_buffers - .entry(view) - .or_insert_with(|| PreprocessWorkItemBuffer { - buffer: BufferVec::new(BufferUsages::STORAGE), - rendered: true, - }); - work_item_buffer.rendered = true; + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); // Prepare batchables. 
@@ -568,7 +548,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( }; let output_index = data_buffer.add() as u32; - work_item_buffer.buffer.push(PreprocessWorkItem { + work_item_buffer.push(PreprocessWorkItem { input_index, output_index, }); @@ -598,7 +578,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( }; let output_index = data_buffer.add() as u32; - work_item_buffer.buffer.push(PreprocessWorkItem { + work_item_buffer.push(PreprocessWorkItem { input_index, output_index, }); @@ -637,9 +617,7 @@ pub fn write_batched_instance_buffer( // that on the previous frame, and it hasn't changed. for index_buffer in index_buffers.values_mut() { - index_buffer - .buffer - .write_buffer(&render_device, &render_queue); + index_buffer.write_buffer(&render_device, &render_queue); } } } From e9024ea457dfe0b7a783a3ae39cf68fc9bcb34f9 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Wed, 3 Apr 2024 01:27:25 -0700 Subject: [PATCH 15/39] Address review comment --- crates/bevy_pbr/src/prepass/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index d798140cd188e..1e52f8beb1303 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -163,7 +163,7 @@ where .add_systems(ExtractSchedule, extract_camera_previous_view_projection) .add_systems( Render, - (prepare_previous_view_projection_uniforms).in_set(RenderSet::PrepareResources), + prepare_previous_view_projection_uniforms.in_set(RenderSet::PrepareResources), ); } From a5a492de3344e4abba2fc2c085ace5f9f9cf8fe4 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Wed, 3 Apr 2024 15:41:28 -0700 Subject: [PATCH 16/39] Address review comments --- crates/bevy_pbr/src/lib.rs | 14 +-- crates/bevy_pbr/src/render/mesh.rs | 16 ++-- crates/bevy_render/src/batching/mod.rs | 115 ++++++++++++------------- crates/bevy_sprite/src/mesh2d/mesh.rs | 2 +- 4 files changed, 73 
insertions(+), 74 deletions(-) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 1f5b11e2e8cb2..08438fca791e4 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -79,7 +79,7 @@ pub mod graph { /// Label for the screen space ambient occlusion render node. ScreenSpaceAmbientOcclusion, DeferredLightingPass, - /// Label for the compute shader mesh uniforms building pass. + /// Label for the compute shader instance data building pass. GpuPreprocess, } } @@ -138,7 +138,7 @@ pub struct PbrPlugin { /// Controls if GPU [`MeshUniform`] building is enabled. /// /// This requires compute shader support. - pub use_gpu_uniform_builder: bool, + pub use_gpu_instance_buffer_builder: bool, } impl Default for PbrPlugin { @@ -147,9 +147,9 @@ impl Default for PbrPlugin { prepass_enabled: true, add_default_deferred_lighting_plugin: true, - // The GPU uniform builder requires compute shaders, which aren't - // available on any version of WebGL. - use_gpu_uniform_builder: cfg!(any( + // The GPU instance buffer builder requires compute shaders, which + // aren't available on any version of WebGL. 
+ use_gpu_instance_buffer_builder: cfg!(any( feature = "webgpu", not(feature = "webgl"), not(target_arch = "wasm32"), @@ -295,7 +295,7 @@ impl Plugin for PbrPlugin { .init_resource::() .add_plugins(( MeshRenderPlugin { - use_gpu_uniform_builder: self.use_gpu_uniform_builder, + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, }, MaterialPlugin:: { prepass_enabled: self.prepass_enabled, @@ -368,7 +368,7 @@ impl Plugin for PbrPlugin { app.add_plugins(DeferredPbrLightingPlugin); } - if self.use_gpu_uniform_builder { + if self.use_gpu_instance_buffer_builder { app.add_plugins(GpuMeshPreprocessPlugin); } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index cc0d03edf39fe..7e6a9548492dc 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -56,7 +56,7 @@ pub struct MeshRenderPlugin { /// Whether we're building [`MeshUniform`]s on GPU. /// /// If this is false, we're building them on CPU. - pub use_gpu_uniform_builder: bool, + pub use_gpu_instance_buffer_builder: bool, } pub const FORWARD_IO_HANDLE: Handle = Handle::weak_from_u128(2645551199423808407); @@ -83,7 +83,7 @@ pub const MESH_PIPELINE_VIEW_LAYOUT_SAFE_MAX_TEXTURES: usize = 10; impl Default for MeshRenderPlugin { fn default() -> Self { Self { - use_gpu_uniform_builder: true, + use_gpu_instance_buffer_builder: true, } } } @@ -139,7 +139,7 @@ impl Plugin for MeshRenderPlugin { )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - let render_mesh_instances = RenderMeshInstances::new(self.use_gpu_uniform_builder); + let render_mesh_instances = RenderMeshInstances::new(self.use_gpu_instance_buffer_builder); render_app .init_resource::() @@ -168,7 +168,7 @@ impl Plugin for MeshRenderPlugin { ), ); - if self.use_gpu_uniform_builder { + if self.use_gpu_instance_buffer_builder { render_app.add_systems( ExtractSchedule, extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), @@ -190,7 +190,7 @@ impl Plugin for 
MeshRenderPlugin { let batched_instance_buffers = BatchedInstanceBuffers::::new( render_device, - self.use_gpu_uniform_builder, + self.use_gpu_instance_buffer_builder, ); if let Some(per_object_buffer_batch_size) = @@ -458,8 +458,8 @@ pub enum RenderMeshInstances { } impl RenderMeshInstances { - fn new(use_gpu_uniform_builder: bool) -> RenderMeshInstances { - if use_gpu_uniform_builder { + fn new(use_gpu_instance_buffer_builder: bool) -> RenderMeshInstances { + if use_gpu_instance_buffer_builder { RenderMeshInstances::GpuBuilding(EntityHashMap::default()) } else { RenderMeshInstances::CpuBuilding(EntityHashMap::default()) @@ -1503,7 +1503,7 @@ pub fn prepare_mesh_bind_group( ) { groups.reset(); let layouts = &mesh_pipeline.mesh_layouts; - let Some(model) = mesh_uniforms.uniform_binding() else { + let Some(model) = mesh_uniforms.instance_data_binding() else { return; }; groups.model_only = Some(layouts.model_only(&render_device, &model)); diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 8ecdc6ee0f87f..b1343a7288565 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -140,8 +140,8 @@ where BDI: Pod, { /// Creates new buffers. - pub fn new(render_device: &RenderDevice, use_gpu_uniform_builder: bool) -> Self { - if !use_gpu_uniform_builder { + pub fn new(render_device: &RenderDevice, use_gpu_instance_buffer_builder: bool) -> Self { + if !use_gpu_instance_buffer_builder { return BatchedInstanceBuffers::CpuBuilt(GpuArrayBuffer::new(render_device)); } @@ -153,12 +153,11 @@ where } } - /// Returns the binding of the uniform buffer that contains the per-instance - /// data. + /// Returns the binding of the buffer that contains the per-instance data. /// - /// If we're in the GPU uniform building mode, this buffer needs to be - /// filled in via a compute shader. 
- pub fn uniform_binding(&self) -> Option { + /// If we're in the GPU instance buffer building mode, this buffer needs to + /// be filled in via a compute shader. + pub fn instance_data_binding(&self) -> Option { match *self { BatchedInstanceBuffers::CpuBuilt(ref buffer) => buffer.binding(), BatchedInstanceBuffers::GpuBuilt { @@ -186,18 +185,18 @@ pub trait GetBatchData { /// The per-instance data that was inserted into the [`BufferVec`] during /// extraction. /// - /// This is only used when building uniforms on GPU. If this pipeline - /// doesn't support GPU uniform building (e.g. the 2D mesh pipeline), this - /// can safely be `()`. + /// This is only used when building instance data on GPU. If this pipeline + /// doesn't support GPU instance buffer building (e.g. the 2D mesh + /// pipeline), this can safely be `()`. type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// If the instance can be batched, also return the data used for /// comparison when deciding whether draws can be batched, else return None /// for the `CompareData`. /// - /// This is only called when building uniforms on CPU. In the GPU uniform - /// building path, we use [`GetBatchData::get_batch_preprocess_work_item`] - /// instead. + /// This is only called when building instance data on CPU. In the GPU + /// uniform building path, we use + /// [`GetBatchData::get_batch_preprocess_work_item`] instead. fn get_batch_data( param: &SystemParamItem, query_item: Entity, @@ -207,7 +206,7 @@ pub trait GetBatchData { /// /// We already inserted the [`GetBatchData::BufferInputData`] during the /// extraction phase before we got here, so this function shouldn't need to - /// look up any render data. If CPU uniform building is in use, this + /// look up any render data. If CPU instance buffer building is in use, this /// function will never be called. 
fn get_batch_input_index( param: &SystemParamItem, @@ -229,16 +228,16 @@ pub trait GetBinnedBatchData { /// The per-instance data that was inserted into the [`BufferVec`] during /// extraction. /// - /// This is only used when building uniforms on GPU. If this pipeline - /// doesn't support GPU uniform building (e.g. the 2D mesh pipeline), this - /// can safely be `()`. + /// This is only used when building instance buffers on GPU. If this + /// pipeline doesn't support GPU instance buffer building (e.g. the 2D mesh + /// pipeline), this can safely be `()`. type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// - /// This is only called when building uniforms on CPU. In the GPU uniform - /// building path, we use [`GetBatchData::get_batch_preprocess_work_item`] - /// instead. + /// This is only called when building uniforms on CPU. In the GPU instance + /// buffer building path, we use + /// [`GetBatchData::get_batch_preprocess_work_item`] instead. fn get_batch_data( param: &SystemParamItem, entity: Entity, @@ -248,8 +247,8 @@ pub trait GetBinnedBatchData { /// /// We already inserted the [`GetBinnedBatchData::BufferInputData`] during /// the extraction phase before we got here, so this function shouldn't need - /// to look up any render data. If CPU uniform building is in use, this - /// function will never be called. + /// to look up any render data. If CPU instance buffer building is in use, + /// this function will never be called. fn get_batch_input_index( param: &SystemParamItem, query_item: Entity, @@ -308,9 +307,9 @@ pub fn delete_old_work_item_buffers( } } -/// Batch the items in a sorted render phase, when GPU uniform building isn't in -/// use. This means comparing metadata needed to draw each phase item and trying -/// to combine the draws into a batch. +/// Batch the items in a sorted render phase, when GPU instance buffer building +/// isn't in use. 
This means comparing metadata needed to draw each phase item +/// and trying to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase( gpu_array_buffer: ResMut>, mut views: Query<&mut SortedRenderPhase>, @@ -337,12 +336,12 @@ pub fn batch_and_prepare_sorted_render_phase( } }; - for mut phase in &mut views { - // We only process CPU-built batch data in this function. - let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { - continue; - }; + // We only process CPU-built batch data in this function. + let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + return; + }; + for mut phase in &mut views { let items = phase.items.iter_mut().map(|item| { let batch_data = process_item(item, buffer); (item.batch_range_mut(), batch_data) @@ -358,9 +357,9 @@ pub fn batch_and_prepare_sorted_render_phase( } } -/// Batch the items in a sorted render phase, when GPU uniform building isn't in -/// use. This means comparing metadata needed to draw each phase item and trying -/// to combine the draws into a batch. +/// Batch the items in a sorted render phase, when GPU instance buffer building +/// isn't in use. This means comparing metadata needed to draw each phase item +/// and trying to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( gpu_array_buffer: ResMut>, mut views: Query<(Entity, &mut SortedRenderPhase)>, @@ -394,17 +393,17 @@ pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( } }; - for (view, mut phase) in &mut views { - // We only process GPU-built batch data in this function. - let BatchedInstanceBuffers::GpuBuilt { - ref mut data_buffer, - ref mut work_item_buffers, - .. - } = gpu_array_buffer - else { - continue; - }; + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers::GpuBuilt { + ref mut data_buffer, + ref mut work_item_buffers, + .. 
+ } = gpu_array_buffer + else { + return; + }; + for (view, mut phase) in &mut views { // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. let work_item_buffer = work_item_buffers @@ -450,14 +449,14 @@ pub fn batch_and_prepare_binned_render_phase( let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); + // We only process CPU-built batch data in this function. + let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + return; + }; + for mut phase in &mut views { let phase = &mut *phase; // Borrow checker. - // We only process CPU-built batch data in this function. - let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { - continue; - }; - // Prepare batchables. for key in &phase.batchable_keys { @@ -518,19 +517,19 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers::GpuBuilt { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_array_buffer + else { + return; + }; + for (view, mut phase) in &mut views { let phase = &mut *phase; // Borrow checker. - // We only process GPU-built batch data in this function. - let BatchedInstanceBuffers::GpuBuilt { - ref mut data_buffer, - ref mut work_item_buffers, - .. - } = gpu_array_buffer - else { - continue; - }; - // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. 
let work_item_buffer = work_item_buffers diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 287db60ca1261..a3fd0aa98694b 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -589,7 +589,7 @@ pub fn prepare_mesh2d_bind_group( render_device: Res, mesh2d_uniforms: Res>, ) { - if let Some(binding) = mesh2d_uniforms.uniform_binding() { + if let Some(binding) = mesh2d_uniforms.instance_data_binding() { commands.insert_resource(Mesh2dBindGroup { value: render_device.create_bind_group( "mesh2d_bind_group", From 0dd53a4871cfe156b21d428ad4cbd797b3155e0f Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Wed, 3 Apr 2024 15:44:04 -0700 Subject: [PATCH 17/39] Rustfmt --- crates/bevy_pbr/src/render/mesh.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 7e6a9548492dc..e8bf5f6e6a119 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -139,7 +139,8 @@ impl Plugin for MeshRenderPlugin { )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - let render_mesh_instances = RenderMeshInstances::new(self.use_gpu_instance_buffer_builder); + let render_mesh_instances = + RenderMeshInstances::new(self.use_gpu_instance_buffer_builder); render_app .init_resource::() From 309a7b68455135a5a4b12de8932de197f53dbff3 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 4 Apr 2024 17:16:44 -0700 Subject: [PATCH 18/39] Fix the issue reported by @Elabajaba. The problem was subtle. We were passing the entire work item buffer to the preprocessing shader via `as_entire_binding()`. The trouble is that `BufferVec` won't reallocate a GPU side buffer if it's already big enough to hold the underlying contents. The `as_entire_binding()` method uses the *currently-allocated GPU size*, not the CPU size, of the buffer. 
In the preprocessing shader, we use `arrayLength` on the work item buffer so that threads with invocation IDs greater than the length of that buffer return early without corrupting the results. The `arrayLength` function, like `as_entire_binding()`, checks the *GPU size*, not the CPU size, of the buffer. So, if the buffer managed to shrink from frame to frame, then a few threads in the final workgroup would be reading past the conceptual end of the buffer into last frame's data. This would result in graphical corruption. This patch fixes the issue by using a `BindingResource` with the explicit size set instead of `as_entire_binding()`. A comment has been added explaining the situation. --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 25 ++++++++++++++++---- crates/bevy_render/src/batching/mod.rs | 4 ++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 628c13075d32c..53aa6f09dc9bd 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -6,6 +6,8 @@ //! [`MeshInputUniform`]s instead and use the GPU to calculate the remaining //! derived fields in [`MeshUniform`]. 
+use std::num::NonZeroU64; + use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; use bevy_core_pipeline::core_3d::graph::Core3d; @@ -22,10 +24,10 @@ use bevy_render::{ render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, - BindGroup, BindGroupEntries, BindGroupLayout, CachedComputePipelineId, - ComputePassDescriptor, ComputePipelineDescriptor, DynamicBindGroupLayoutEntries, - PipelineCache, Shader, ShaderStages, SpecializedComputePipeline, - SpecializedComputePipelines, + BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding, + CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, + DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, + SpecializedComputePipeline, SpecializedComputePipelines, }, renderer::{RenderContext, RenderDevice}, Render, RenderApp, RenderSet, @@ -269,6 +271,15 @@ pub fn prepare_preprocess_bind_groups( let Some(index_buffer) = index_buffer_vec.buffer() else { continue; }; + + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let index_buffer_size = NonZeroU64::try_from( + index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + commands .entity(*view) .insert(PreprocessBindGroup(render_device.create_bind_group( @@ -277,7 +288,11 @@ pub fn prepare_preprocess_bind_groups( &BindGroupEntries::sequential(( current_input_buffer.as_entire_binding(), previous_input_buffer.as_entire_binding(), - index_buffer.as_entire_binding(), + BindingResource::Buffer(BufferBinding { + buffer: index_buffer, + offset: 0, + size: index_buffer_size, + }), data_buffer.as_entire_binding(), )), ))); diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index b1343a7288565..679c8518719e3 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -201,8 +201,8 @@ pub trait GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option<(Self::BufferData, Option)>; - /// Returns the index of the [`GetBinnedBatchData::BufferInputData`] that - /// the GPU preprocessing phase will use. + /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU + /// preprocessing phase will use. 
/// /// We already inserted the [`GetBatchData::BufferInputData`] during the /// extraction phase before we got here, so this function shouldn't need to From 2b5155cdc71f2dbcc5d8b7dfc9f762ab51826067 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 4 Apr 2024 20:52:20 -0700 Subject: [PATCH 19/39] Doc check police --- crates/bevy_pbr/src/render/mesh.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index e8bf5f6e6a119..b097c36a1debb 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -262,6 +262,7 @@ pub struct MeshInputUniform { pub transform: [Vec4; 3], /// Four 16-bit unsigned normalized UV values packed into a `UVec2`: /// + /// ```text /// <--- MSB LSB ---> /// +---- min v ----+ +---- min u ----+ /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, @@ -269,6 +270,7 @@ pub struct MeshInputUniform { /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, /// /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` pub lightmap_uv_rect: UVec2, /// Various [`MeshFlags`]. pub flags: u32, @@ -398,6 +400,7 @@ pub struct RenderMeshInstanceGpuBuilder { pub transform: Affine3, /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]: /// + /// ```text /// <--- MSB LSB ---> /// +---- min v ----+ +---- min u ----+ /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, @@ -405,6 +408,7 @@ pub struct RenderMeshInstanceGpuBuilder { /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, /// /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` pub lightmap_uv_rect: UVec2, /// Various flags. pub mesh_flags: MeshFlags, From d2a170c89eb0c289ee60cbd0d64730b5e2251acb Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 00:34:07 -0700 Subject: [PATCH 20/39] Make sure the compiler never sees `BufferVec<()>`. 
This commit makes the following splits: * `BatchedInstanceBuffers` becomes `BatchedCpuBuiltInstanceBuffers` and `BatchedGpuBuiltInstanceBuffers`. * `GetBatchData` becomes `GetBatchData` and `GetBatchInputData`. * `GetBinnedBatchData` becomes `GetBinnedBatchData` and `GetBinnedBatchInputData`. These changes ensure that the generic expansion of the 2D mesh code never produces a `BufferVec<()>`. --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 22 +- crates/bevy_pbr/src/render/mesh.rs | 96 ++++-- crates/bevy_render/src/batching/mod.rs | 336 ++++++++++--------- crates/bevy_render/src/render_phase/mod.rs | 100 +++++- crates/bevy_sprite/src/mesh2d/mesh.rs | 33 +- 5 files changed, 364 insertions(+), 223 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 53aa6f09dc9bd..beadf8793b0a2 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -20,7 +20,7 @@ use bevy_ecs::{ world::{FromWorld, World}, }; use bevy_render::{ - batching::{BatchedInstanceBuffers, PreprocessWorkItem}, + batching::{BatchedGpuBuiltInstanceBuffers, PreprocessWorkItem}, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, @@ -128,10 +128,10 @@ impl Node for GpuPreprocessNode { ) -> Result<(), NodeRunError> { // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. - let BatchedInstanceBuffers::GpuBuilt { + let Some(BatchedGpuBuiltInstanceBuffers { work_item_buffers: ref index_buffers, .. 
- } = world.resource::>() + }) = world.get_resource::>() else { error!( "Attempted to preprocess meshes on GPU, but `GpuBuilt` batched instance buffers \ @@ -244,20 +244,22 @@ pub fn prepare_preprocess_pipeline( pub fn prepare_preprocess_bind_groups( mut commands: Commands, render_device: Res, - batched_instance_buffers: Res>, + gpu_batched_instance_buffers: Option< + Res>, + >, pipeline: Res, ) { - // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh + // Grab the [`BatchedGpuBuiltInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. - let BatchedInstanceBuffers::GpuBuilt { + let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + let BatchedGpuBuiltInstanceBuffers { data_buffer: ref data_buffer_vec, work_item_buffers: ref index_buffers, current_input_buffer: ref current_input_buffer_vec, previous_input_buffer: ref previous_input_buffer_vec, - } = *batched_instance_buffers - else { - return; - }; + } = gpu_batched_instance_buffers.into_inner(); let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( current_input_buffer_vec.buffer(), diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index b097c36a1debb..c28ecefc1b2ee 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -16,14 +16,15 @@ use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ clear_batched_instance_buffers, delete_old_work_item_buffers, - write_batched_instance_buffer, BatchedInstanceBuffers, GetBatchData, GetBinnedBatchData, - NoAutomaticBatching, + write_cpu_built_batched_instance_buffers, write_gpu_built_batched_instance_buffers, + BatchedCpuBuiltInstanceBuffer, BatchedGpuBuiltInstanceBuffers, GetBatchData, + GetBatchInputData, GetBinnedBatchData, GetBinnedBatchInputData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, render_phase::{ - BinnedRenderPhasePlugin, PhaseItem, 
RenderCommand, RenderCommandResult, - SortedRenderPhasePlugin, TrackedRenderPass, + BinnedRenderPhaseGpuPreprocessingPlugin, PhaseItem, RenderCommand, RenderCommandResult, + SortedRenderPhaseGpuPreprocessingPlugin, TrackedRenderPass, }, render_resource::*, renderer::{RenderDevice, RenderQueue}, @@ -129,13 +130,13 @@ impl Plugin for MeshRenderPlugin { (no_automatic_skin_batching, no_automatic_morph_batching), ) .add_plugins(( - BinnedRenderPhasePlugin::::default(), - BinnedRenderPhasePlugin::::default(), - BinnedRenderPhasePlugin::::default(), - BinnedRenderPhasePlugin::::default(), - BinnedRenderPhasePlugin::::default(), - SortedRenderPhasePlugin::::default(), - SortedRenderPhasePlugin::::default(), + BinnedRenderPhaseGpuPreprocessingPlugin::::default(), + BinnedRenderPhaseGpuPreprocessingPlugin::::default(), + BinnedRenderPhaseGpuPreprocessingPlugin::::default(), + BinnedRenderPhaseGpuPreprocessingPlugin::::default(), + BinnedRenderPhaseGpuPreprocessingPlugin::::default(), + SortedRenderPhaseGpuPreprocessingPlugin::::default(), + SortedRenderPhaseGpuPreprocessingPlugin::::default(), )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { @@ -160,7 +161,9 @@ impl Plugin for MeshRenderPlugin { delete_old_work_item_buffers:: .in_set(RenderSet::ManageViews) .after(prepare_view_targets), - write_batched_instance_buffer:: + write_cpu_built_batched_instance_buffers:: + .in_set(RenderSet::PrepareResourcesFlush), + write_gpu_built_batched_instance_buffers:: .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), prepare_morphs.in_set(RenderSet::PrepareResources), @@ -187,13 +190,18 @@ impl Plugin for MeshRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - let render_device = render_app.world().resource::(); - let batched_instance_buffers = - BatchedInstanceBuffers::::new( - render_device, - self.use_gpu_instance_buffer_builder, - ); + if 
self.use_gpu_instance_buffer_builder { + render_app + .init_resource::>( + ); + } else { + let render_device = render_app.world().resource::(); + let cpu_batched_instance_buffer = + BatchedCpuBuiltInstanceBuffer::::new(render_device); + render_app.insert_resource(cpu_batched_instance_buffer); + }; + let render_device = render_app.world().resource::(); if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size(render_device) { @@ -203,9 +211,7 @@ impl Plugin for MeshRenderPlugin { )); } - render_app - .insert_resource(batched_instance_buffers) - .init_resource::(); + render_app.init_resource::(); } // Load the mesh_bindings shader module here as it depends on runtime information about @@ -609,7 +615,9 @@ pub fn extract_meshes_for_cpu_building( /// [`MeshUniform`] building. pub fn extract_meshes_for_gpu_building( mut render_mesh_instances: ResMut, - mut batched_instance_buffers: ResMut>, + mut gpu_batched_instance_buffers: ResMut< + BatchedGpuBuiltInstanceBuffers, + >, mut render_mesh_instance_queues: Local>>, mut prev_render_mesh_instances: Local>, meshes_query: Extract< @@ -673,7 +681,7 @@ pub fn extract_meshes_for_gpu_building( collect_meshes_for_gpu_building( &mut render_mesh_instances, - &mut batched_instance_buffers, + &mut gpu_batched_instance_buffers, &mut render_mesh_instance_queues, &mut prev_render_mesh_instances, ); @@ -683,7 +691,10 @@ pub fn extract_meshes_for_gpu_building( /// uniforms are built. 
fn collect_meshes_for_gpu_building( render_mesh_instances: &mut RenderMeshInstances, - batched_instance_buffers: &mut BatchedInstanceBuffers, + gpu_batched_instance_buffers: &mut BatchedGpuBuiltInstanceBuffers< + MeshUniform, + MeshInputUniform, + >, render_mesh_instance_queues: &mut Parallel>, prev_render_mesh_instances: &mut EntityHashMap, ) { @@ -696,14 +707,11 @@ fn collect_meshes_for_gpu_building( ); }; - let BatchedInstanceBuffers::GpuBuilt { + let BatchedGpuBuiltInstanceBuffers { ref mut current_input_buffer, ref mut previous_input_buffer, .. - } = *batched_instance_buffers - else { - unreachable!() - }; + } = gpu_batched_instance_buffers; // Swap buffers. mem::swap(current_input_buffer, previous_input_buffer); @@ -871,8 +879,6 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; - type BufferInputData = MeshInputUniform; - fn get_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, @@ -899,6 +905,10 @@ impl GetBatchData for MeshPipeline { )), )) } +} + +impl GetBatchInputData for MeshPipeline { + type BufferInputData = MeshInputUniform; fn get_batch_input_index( (mesh_instances, lightmaps): &SystemParamItem, @@ -932,8 +942,6 @@ impl GetBinnedBatchData for MeshPipeline { type BufferData = MeshUniform; - type BufferInputData = MeshInputUniform; - fn get_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, @@ -953,6 +961,10 @@ impl GetBinnedBatchData for MeshPipeline { maybe_lightmap.map(|lightmap| lightmap.uv_rect), )) } +} + +impl GetBinnedBatchInputData for MeshPipeline { + type BufferInputData = MeshInputUniform; fn get_batch_input_index( (mesh_instances, _): &SystemParamItem, @@ -1501,16 +1513,30 @@ pub fn prepare_mesh_bind_group( mut groups: ResMut, mesh_pipeline: Res, render_device: Res, - mesh_uniforms: Res>, + cpu_batched_instance_buffer: Option>>, + gpu_batched_instance_buffers: Option< + Res>, + >, skins_uniform: Res, weights_uniform: Res, render_lightmaps: Res, ) { groups.reset(); let 
layouts = &mesh_pipeline.mesh_layouts; - let Some(model) = mesh_uniforms.instance_data_binding() else { + + let model = if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer { + cpu_batched_instance_buffer + .into_inner() + .instance_data_binding() + } else if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers { + gpu_batched_instance_buffers + .into_inner() + .instance_data_binding() + } else { return; }; + let Some(model) = model else { return }; + groups.model_only = Some(layouts.model_only(&render_device, &model)); let skin = skins_uniform.buffer.buffer(); diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 679c8518719e3..4c66f04bfec6b 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -1,5 +1,6 @@ use std::marker::PhantomData; +use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ component::Component, entity::Entity, @@ -81,45 +82,42 @@ impl BatchMeta { /// uses less GPU bus bandwidth, but only implemented for some pipelines (for /// example, not in the 2D pipeline at present) and only when compute shader is /// available. +#[derive(Resource, Deref, DerefMut)] +pub struct BatchedCpuBuiltInstanceBuffer(pub GpuArrayBuffer) +where + BD: GpuArrayBufferable + Sync + Send + 'static; + #[derive(Resource)] -pub enum BatchedInstanceBuffers +pub struct BatchedGpuBuiltInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, BDI: Pod, { - /// The single buffer containing instances, used when GPU uniform building - /// isn't available. - CpuBuilt(GpuArrayBuffer), - - /// The buffers containing per-instance data used when GPU uniform building - /// is in use. - GpuBuilt { - /// A storage area for the buffer data that the GPU compute shader is - /// expected to write to. - /// - /// There will be one entry for each index. 
- data_buffer: UninitBufferVec, - - /// The index of the buffer data in the current input buffer that - /// corresponds to each instance. - /// - /// This is keyed off each view. Each view has a separate buffer. - work_item_buffers: EntityHashMap>, - - /// The uniform data inputs for the current frame. - /// - /// These are uploaded during the extraction phase. - current_input_buffer: BufferVec, - - /// The uniform data inputs for the previous frame. - /// - /// The indices don't generally line up between `current_input_buffer` - /// and `previous_input_buffer`, because, among other reasons, entities - /// can spawn or despawn between frames. Instead, each current buffer - /// data input uniform is expected to contain the index of the - /// corresponding buffer data input uniform in this list. - previous_input_buffer: BufferVec, - }, + /// A storage area for the buffer data that the GPU compute shader is + /// expected to write to. + /// + /// There will be one entry for each index. + pub data_buffer: UninitBufferVec, + + /// The index of the buffer data in the current input buffer that + /// corresponds to each instance. + /// + /// This is keyed off each view. Each view has a separate buffer. + pub work_item_buffers: EntityHashMap>, + + /// The uniform data inputs for the current frame. + /// + /// These are uploaded during the extraction phase. + pub current_input_buffer: BufferVec, + + /// The uniform data inputs for the previous frame. + /// + /// The indices don't generally line up between `current_input_buffer` + /// and `previous_input_buffer`, because, among other reasons, entities + /// can spawn or despawn between frames. Instead, each current buffer + /// data input uniform is expected to contain the index of the + /// corresponding buffer data input uniform in this list. + pub previous_input_buffer: BufferVec, } /// One invocation of the preprocessing shader: i.e. 
one mesh instance in a @@ -134,18 +132,32 @@ pub struct PreprocessWorkItem { pub output_index: u32, } -impl BatchedInstanceBuffers +impl BatchedCpuBuiltInstanceBuffer where BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod, { /// Creates new buffers. - pub fn new(render_device: &RenderDevice, use_gpu_instance_buffer_builder: bool) -> Self { - if !use_gpu_instance_buffer_builder { - return BatchedInstanceBuffers::CpuBuilt(GpuArrayBuffer::new(render_device)); - } + pub fn new(render_device: &RenderDevice) -> Self { + BatchedCpuBuiltInstanceBuffer(GpuArrayBuffer::new(render_device)) + } + + /// Returns the binding of the buffer that contains the per-instance data. + /// + /// If we're in the GPU instance buffer building mode, this buffer needs to + /// be filled in via a compute shader. + pub fn instance_data_binding(&self) -> Option { + self.binding() + } +} - BatchedInstanceBuffers::GpuBuilt { +impl BatchedGpuBuiltInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// Creates new buffers. + pub fn new() -> Self { + BatchedGpuBuiltInstanceBuffers { data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), work_item_buffers: EntityHashMap::default(), current_input_buffer: BufferVec::new(BufferUsages::STORAGE), @@ -155,17 +167,21 @@ where /// Returns the binding of the buffer that contains the per-instance data. /// - /// If we're in the GPU instance buffer building mode, this buffer needs to - /// be filled in via a compute shader. + /// This buffer needs to be filled in via a compute shader. pub fn instance_data_binding(&self) -> Option { - match *self { - BatchedInstanceBuffers::CpuBuilt(ref buffer) => buffer.binding(), - BatchedInstanceBuffers::GpuBuilt { - ref data_buffer, .. 
- } => data_buffer - .buffer() - .map(|buffer| buffer.as_entire_binding()), - } + self.data_buffer + .buffer() + .map(|buffer| buffer.as_entire_binding()) + } +} + +impl Default for BatchedGpuBuiltInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + fn default() -> Self { + Self::new() } } @@ -182,13 +198,6 @@ pub trait GetBatchData { /// The per-instance data to be inserted into the [`GpuArrayBuffer`] /// containing these data for all instances. type BufferData: GpuArrayBufferable + Sync + Send + 'static; - /// The per-instance data that was inserted into the [`BufferVec`] during - /// extraction. - /// - /// This is only used when building instance data on GPU. If this pipeline - /// doesn't support GPU instance buffer building (e.g. the 2D mesh - /// pipeline), this can safely be `()`. - type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// If the instance can be batched, also return the data used for /// comparison when deciding whether draws can be batched, else return None @@ -201,6 +210,17 @@ pub trait GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option<(Self::BufferData, Option)>; +} + +/// A trait to support getting data used for batching draw commands via phase +/// items. +/// +/// This is only used when GPU preprocessing is in use. +pub trait GetBatchInputData: GetBatchData { + /// The per-instance data that was inserted into the [`BufferVec`] during + /// extraction. + type BufferInputData: Pod + Sync + Send; + /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU /// preprocessing phase will use. /// @@ -225,30 +245,31 @@ pub trait GetBinnedBatchData { /// The per-instance data to be inserted into the [`GpuArrayBuffer`] /// containing these data for all instances. 
type BufferData: GpuArrayBufferable + Sync + Send + 'static; - /// The per-instance data that was inserted into the [`BufferVec`] during - /// extraction. - /// - /// This is only used when building instance buffers on GPU. If this - /// pipeline doesn't support GPU instance buffer building (e.g. the 2D mesh - /// pipeline), this can safely be `()`. - type BufferInputData: Pod + Sync + Send; /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. /// /// This is only called when building uniforms on CPU. In the GPU instance /// buffer building path, we use - /// [`GetBatchData::get_batch_preprocess_work_item`] instead. + /// [`GetBinnedBatchDataForGpuPreprocessing::get_batch_input_index`] + /// instead. fn get_batch_data( param: &SystemParamItem, entity: Entity, ) -> Option; +} + +/// Like [`GetBinnedBatchData`], but for GPU preprocessing. +pub trait GetBinnedBatchInputData: GetBinnedBatchData { + /// The per-instance data that was inserted into the [`BufferVec`] during + /// extraction. + type BufferInputData: Pod + Sync + Send; + /// Returns the index of the [`GetBinnedBatchData::BufferInputData`] that /// the GPU preprocessing phase will use. /// /// We already inserted the [`GetBinnedBatchData::BufferInputData`] during /// the extraction phase before we got here, so this function shouldn't need - /// to look up any render data. If CPU instance buffer building is in use, - /// this function will never be called. + /// to look up any render data. fn get_batch_input_index( param: &SystemParamItem, query_item: Entity, @@ -261,27 +282,24 @@ pub trait GetBinnedBatchData { /// We have to run this during extraction because, if GPU preprocessing is in /// use, the extraction phase will write to the mesh input uniform buffers /// directly, so the buffers need to be cleared before then. 
-pub fn clear_batched_instance_buffers( - mut gpu_array_buffer: ResMut>, +pub fn clear_batched_instance_buffers( + cpu_batched_instance_buffer: Option>>, + gpu_batched_instance_buffers: Option< + ResMut>, + >, ) where - F: GetBatchData, + GBID: GetBatchInputData, { - match *gpu_array_buffer { - BatchedInstanceBuffers::CpuBuilt(ref mut buffer) => { - buffer.clear(); - } - BatchedInstanceBuffers::GpuBuilt { - ref mut data_buffer, - ref mut work_item_buffers, - ref mut current_input_buffer, - ref mut previous_input_buffer, - } => { - data_buffer.clear(); - current_input_buffer.clear(); - previous_input_buffer.clear(); - for work_item_buffer in work_item_buffers.values_mut() { - work_item_buffer.clear(); - } + if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { + cpu_batched_instance_buffer.clear(); + } + + if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { + gpu_batched_instance_buffers.data_buffer.clear(); + gpu_batched_instance_buffers.current_input_buffer.clear(); + gpu_batched_instance_buffers.previous_input_buffer.clear(); + for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { + work_item_buffer.clear(); } } } @@ -292,33 +310,32 @@ pub fn clear_batched_instance_buffers( /// This is a separate system from [`clear_batched_instance_buffers`] because /// [`ViewTarget`]s aren't created until after the extraction phase is /// completed. -pub fn delete_old_work_item_buffers( - mut gpu_array_buffer: ResMut>, +pub fn delete_old_work_item_buffers( + gpu_batched_instance_buffers: Option< + ResMut>, + >, view_targets: Query>, ) where - F: GetBatchData, + GBID: GetBatchInputData, { - if let BatchedInstanceBuffers::GpuBuilt { - ref mut work_item_buffers, - .. 
- } = *gpu_array_buffer - { - work_item_buffers.retain(|entity, _| view_targets.contains(*entity)); + if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { + gpu_batched_instance_buffers + .work_item_buffers + .retain(|entity, _| view_targets.contains(*entity)); } } /// Batch the items in a sorted render phase, when GPU instance buffer building /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. -pub fn batch_and_prepare_sorted_render_phase( - gpu_array_buffer: ResMut>, +pub fn batch_and_prepare_sorted_render_phase_no_gpu_preprocessing( + cpu_batched_instance_buffer: Option>>, mut views: Query<&mut SortedRenderPhase>, param: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, F: GetBatchData, { - let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { @@ -337,13 +354,14 @@ pub fn batch_and_prepare_sorted_render_phase( }; // We only process CPU-built batch data in this function. - let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + let Some(cpu_batched_instance_buffers) = cpu_batched_instance_buffer else { return; }; + let cpu_batched_instance_buffers = cpu_batched_instance_buffers.into_inner(); for mut phase in &mut views { let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, buffer); + let batch_data = process_item(item, cpu_batched_instance_buffers); (item.batch_range_mut(), batch_data) }); items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { @@ -360,23 +378,24 @@ pub fn batch_and_prepare_sorted_render_phase( /// Batch the items in a sorted render phase, when GPU instance buffer building /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. 
-pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( - gpu_array_buffer: ResMut>, +pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( + gpu_batched_instance_buffers: Option< + ResMut>, + >, mut views: Query<(Entity, &mut SortedRenderPhase)>, - param: StaticSystemParam, + param: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, - F: GetBatchData, + GBID: GetBatchInputData, { - let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); let process_item = |item: &mut I, - data_buffer: &mut UninitBufferVec, + data_buffer: &mut UninitBufferVec, work_item_buffer: &mut BufferVec| { let (input_index, compare_data) = - F::get_batch_input_index(&system_param_item, item.entity())?; + GBID::get_batch_input_index(&system_param_item, item.entity())?; let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { @@ -394,14 +413,14 @@ pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( }; // We only process GPU-built batch data in this function. - let BatchedInstanceBuffers::GpuBuilt { + let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + let BatchedGpuBuiltInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_array_buffer - else { - return; - }; + } = gpu_batched_instance_buffers.into_inner(); for (view, mut phase) in &mut views { // Create the work item buffer if necessary; otherwise, just mark it as @@ -438,19 +457,18 @@ where /// Creates batches for a render phase that uses bins, when GPU batch data /// building isn't in use. 
-pub fn batch_and_prepare_binned_render_phase( - gpu_array_buffer: ResMut>, +pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( + cpu_batched_instance_buffer: Option>>, mut views: Query<&mut BinnedRenderPhase>, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, GBBD: GetBinnedBatchData, { - let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); // We only process CPU-built batch data in this function. - let BatchedInstanceBuffers::CpuBuilt(ref mut buffer) = gpu_array_buffer else { + let Some(mut buffer) = cpu_batched_instance_buffer else { return; }; @@ -506,26 +524,27 @@ pub fn batch_and_prepare_binned_render_phase( } /// Creates batches for a render phase that uses bins. -pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( - gpu_array_buffer: ResMut>, +pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( + gpu_batched_instance_buffers: Option< + ResMut>, + >, mut views: Query<(Entity, &mut BinnedRenderPhase)>, - param: StaticSystemParam, + param: StaticSystemParam, ) where BPI: BinnedPhaseItem, - GBBD: GetBinnedBatchData, + GBBDGP: GetBinnedBatchInputData, { - let gpu_array_buffer = gpu_array_buffer.into_inner(); let system_param_item = param.into_inner(); // We only process GPU-built batch data in this function. - let BatchedInstanceBuffers::GpuBuilt { + let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + let BatchedGpuBuiltInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, .. - } = gpu_array_buffer - else { - return; - }; + } = gpu_batched_instance_buffers.into_inner(); for (view, mut phase) in &mut views { let phase = &mut *phase; // Borrow checker. 
@@ -541,7 +560,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( for key in &phase.batchable_keys { let mut batch: Option = None; for &entity in &phase.batchable_values[key] { - let Some(input_index) = GBBD::get_batch_input_index(&system_param_item, entity) + let Some(input_index) = GBBDGP::get_batch_input_index(&system_param_item, entity) else { continue; }; @@ -571,7 +590,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( for key in &phase.unbatchable_keys { let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); for &entity in &unbatchables.entities { - let Some(input_index) = GBBD::get_batch_input_index(&system_param_item, entity) + let Some(input_index) = GBBDGP::get_batch_input_index(&system_param_item, entity) else { continue; }; @@ -584,7 +603,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( unbatchables .buffer_indices - .add(GpuArrayBufferIndex:: { + .add(GpuArrayBufferIndex:: { index: output_index, dynamic_offset: None, element_type: PhantomData, @@ -594,30 +613,41 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( } } -pub fn write_batched_instance_buffer( +pub fn write_cpu_built_batched_instance_buffers( render_device: Res, render_queue: Res, - gpu_array_buffer: ResMut>, -) { - let gpu_array_buffer = gpu_array_buffer.into_inner(); - match gpu_array_buffer { - BatchedInstanceBuffers::CpuBuilt(ref mut gpu_array_buffer) => { - gpu_array_buffer.write_buffer(&render_device, &render_queue); - } - BatchedInstanceBuffers::GpuBuilt { - ref mut data_buffer, - work_item_buffers: ref mut index_buffers, - ref mut current_input_buffer, - previous_input_buffer: _, - } => { - data_buffer.write_buffer(&render_device); - current_input_buffer.write_buffer(&render_device, &render_queue); - // There's no need to write `previous_input_buffer`, as we wrote - // that on the previous frame, and it hasn't changed. 
- - for index_buffer in index_buffers.values_mut() { - index_buffer.write_buffer(&render_device, &render_queue); - } - } + cpu_batched_instance_buffer: Option>>, +) where + GBD: GetBatchData, +{ + if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { + cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue); + } +} + +pub fn write_gpu_built_batched_instance_buffers( + render_device: Res, + render_queue: Res, + gpu_batched_instance_buffers: Option< + ResMut>, + >, +) where + GBID: GetBatchInputData, +{ + let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + + gpu_batched_instance_buffers + .data_buffer + .write_buffer(&render_device); + gpu_batched_instance_buffers + .current_input_buffer + .write_buffer(&render_device, &render_queue); + // There's no need to write `previous_input_buffer`, as we wrote + // that on the previous frame, and it hasn't changed. + + for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { + work_item_buffer.write_buffer(&render_device, &render_queue); } } diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 76d7bc579f189..eec70d73b0e1b 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -38,7 +38,9 @@ use nonmax::NonMaxU32; pub use rangefinder::*; use crate::{ - batching::{self, GetBatchData, GetBinnedBatchData}, + batching::{ + self, GetBatchData, GetBatchInputData, GetBinnedBatchData, GetBinnedBatchInputData, + }, render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}, Render, RenderApp, RenderSet, }; @@ -298,11 +300,24 @@ where /// A convenient abstraction for adding all the systems necessary for a binned /// render phase to the render app. +/// +/// This is the version used when the pipeline doesn't support GPU +/// preprocessing: e.g. 2D meshes. 
pub struct BinnedRenderPhasePlugin(PhantomData<(BPI, GBBD)>) where BPI: BinnedPhaseItem, GBBD: GetBinnedBatchData; +/// A convenient abstraction for adding all the systems necessary for a binned +/// render phase to the render app. +/// +/// This is the version used when the pipeline supports GPU preprocessing: e.g. +/// 3D PBR meshes. +pub struct BinnedRenderPhaseGpuPreprocessingPlugin(PhantomData<(BPI, GBBID)>) +where + BPI: BinnedPhaseItem, + GBBID: GetBinnedBatchInputData; + impl Default for BinnedRenderPhasePlugin where BPI: BinnedPhaseItem, @@ -313,6 +328,16 @@ where } } +impl Default for BinnedRenderPhaseGpuPreprocessingPlugin +where + BPI: BinnedPhaseItem, + GBBID: GetBinnedBatchInputData, +{ + fn default() -> Self { + Self(PhantomData) + } +} + impl Plugin for BinnedRenderPhasePlugin where BPI: BinnedPhaseItem, @@ -323,15 +348,39 @@ where return; }; + render_app.add_systems( + Render, + ( + batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), + batching::batch_and_prepare_binned_render_phase_no_gpu_preprocessing:: + .in_set(RenderSet::PrepareResources), + ), + ); + } +} + +impl Plugin for BinnedRenderPhaseGpuPreprocessingPlugin +where + BPI: BinnedPhaseItem, + GBBID: GetBinnedBatchInputData + Sync + Send + 'static, +{ + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + render_app.add_systems( Render, ( batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), ( - batching::batch_and_prepare_binned_render_phase::, + batching::batch_and_prepare_binned_render_phase_no_gpu_preprocessing::< + BPI, + GBBID, + >, batching::batch_and_prepare_binned_render_phase_for_gpu_preprocessing::< BPI, - GBBD, + GBBID, >, ) .in_set(RenderSet::PrepareResources), @@ -342,11 +391,24 @@ where /// A convenient abstraction for adding all the systems necessary for a sorted /// render phase to the render app. 
+/// +/// This is the version used when the pipeline doesn't support GPU +/// preprocessing: e.g. 2D sprites. pub struct SortedRenderPhasePlugin(PhantomData<(SPI, GBD)>) where SPI: SortedPhaseItem, GBD: GetBatchData; +/// A convenient abstraction for adding all the systems necessary for a sorted +/// render phase to the render app. +/// +/// This is the version used when the pipeline supports GPU preprocessing: e.g. +/// 3D PBR meshes. +pub struct SortedRenderPhaseGpuPreprocessingPlugin(PhantomData<(SPI, GBID)>) +where + SPI: SortedPhaseItem, + GBID: GetBatchInputData; + impl Default for SortedRenderPhasePlugin where SPI: SortedPhaseItem, @@ -357,6 +419,16 @@ where } } +impl Default for SortedRenderPhaseGpuPreprocessingPlugin +where + SPI: SortedPhaseItem, + GBID: GetBatchInputData, +{ + fn default() -> Self { + Self(PhantomData) + } +} + impl Plugin for SortedRenderPhasePlugin where SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem, @@ -367,11 +439,29 @@ where return; }; + render_app.add_systems( + Render, + batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing:: + .in_set(RenderSet::PrepareResources), + ); + } +} + +impl Plugin for SortedRenderPhaseGpuPreprocessingPlugin +where + SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem, + GBID: GetBatchInputData + Sync + Send + 'static, +{ + fn build(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + render_app.add_systems( Render, ( - batching::batch_and_prepare_sorted_render_phase::, - batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, + batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::, + batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, ) .in_set(RenderSet::PrepareResources), ); diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index a3fd0aa98694b..b2aee9a30ad2b 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ 
b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -11,11 +11,13 @@ use bevy_ecs::{ }; use bevy_math::{Affine3, Vec4}; use bevy_reflect::{std_traits::ReflectDefault, Reflect}; -use bevy_render::batching::BatchedInstanceBuffers; +use bevy_render::batching::{ + write_cpu_built_batched_instance_buffers, BatchedCpuBuiltInstanceBuffer, +}; use bevy_render::mesh::MeshVertexBufferLayoutRef; use bevy_render::{ batching::{ - batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, GetBatchData, + batch_and_prepare_sorted_render_phase_no_gpu_preprocessing, GetBatchData, NoAutomaticBatching, }, globals::{GlobalsBuffer, GlobalsUniform}, @@ -33,7 +35,6 @@ use bevy_render::{ Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; use bevy_transform::components::GlobalTransform; -use bevy_utils::tracing::error; use crate::Material2dBindGroupId; @@ -103,9 +104,12 @@ impl Plugin for Mesh2dRenderPlugin { .add_systems( Render, ( - batch_and_prepare_sorted_render_phase:: + batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::< + Transparent2d, + Mesh2dPipeline, + > .in_set(RenderSet::PrepareResources), - write_batched_instance_buffer:: + write_cpu_built_batched_instance_buffers:: .in_set(RenderSet::PrepareResourcesFlush), prepare_mesh2d_bind_group.in_set(RenderSet::PrepareBindGroups), prepare_mesh2d_view_bind_groups.in_set(RenderSet::PrepareBindGroups), @@ -119,10 +123,8 @@ impl Plugin for Mesh2dRenderPlugin { if let Some(render_app) = app.get_sub_app_mut(RenderApp) { let render_device = render_app.world().resource::(); - let batched_instance_buffers = BatchedInstanceBuffers::::new( - render_device, - /*using_gpu_uniform_building=*/ false, - ); + let batched_instance_buffer = + BatchedCpuBuiltInstanceBuffer::::new(render_device); if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size(render_device) @@ -134,7 +136,7 @@ impl Plugin for Mesh2dRenderPlugin { } render_app - .insert_resource(batched_instance_buffers) + 
.insert_resource(batched_instance_buffer) .init_resource::(); } @@ -349,7 +351,6 @@ impl GetBatchData for Mesh2dPipeline { type Param = SRes; type CompareData = (Material2dBindGroupId, AssetId); type BufferData = Mesh2dUniform; - type BufferInputData = (); fn get_batch_data( mesh_instances: &SystemParamItem, @@ -364,14 +365,6 @@ impl GetBatchData for Mesh2dPipeline { )), )) } - - fn get_batch_input_index( - _: &SystemParamItem, - _: Entity, - ) -> Option<(u32, Option)> { - error!("Attempted to build 2D mesh uniforms on GPU, which is currently unsupported"); - None - } } bitflags::bitflags! { @@ -587,7 +580,7 @@ pub fn prepare_mesh2d_bind_group( mut commands: Commands, mesh2d_pipeline: Res, render_device: Res, - mesh2d_uniforms: Res>, + mesh2d_uniforms: Res>, ) { if let Some(binding) = mesh2d_uniforms.instance_data_binding() { commands.insert_resource(Mesh2dBindGroup { From 1cf5e91026c6fe9b38f619c506cae239a02fa18e Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 01:04:42 -0700 Subject: [PATCH 21/39] Reduce the number of `GetBatchData` traits down to two. Now we just have `GetBatchData`, used for 2D sprites, and `GetFullBatchData`, used for 3D PBR meshes. 
--- crates/bevy_pbr/src/render/mesh.rs | 36 ++---- crates/bevy_render/src/batching/mod.rs | 137 ++++++++++----------- crates/bevy_render/src/render_phase/mod.rs | 115 +++-------------- 3 files changed, 93 insertions(+), 195 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index c28ecefc1b2ee..407b52d1eb157 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -18,13 +18,13 @@ use bevy_render::{ clear_batched_instance_buffers, delete_old_work_item_buffers, write_cpu_built_batched_instance_buffers, write_gpu_built_batched_instance_buffers, BatchedCpuBuiltInstanceBuffer, BatchedGpuBuiltInstanceBuffers, GetBatchData, - GetBatchInputData, GetBinnedBatchData, GetBinnedBatchInputData, NoAutomaticBatching, + GetFullBatchData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, render_phase::{ - BinnedRenderPhaseGpuPreprocessingPlugin, PhaseItem, RenderCommand, RenderCommandResult, - SortedRenderPhaseGpuPreprocessingPlugin, TrackedRenderPass, + BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult, + SortedRenderPhasePlugin, TrackedRenderPass, }, render_resource::*, renderer::{RenderDevice, RenderQueue}, @@ -130,13 +130,13 @@ impl Plugin for MeshRenderPlugin { (no_automatic_skin_batching, no_automatic_morph_batching), ) .add_plugins(( - BinnedRenderPhaseGpuPreprocessingPlugin::::default(), - BinnedRenderPhaseGpuPreprocessingPlugin::::default(), - BinnedRenderPhaseGpuPreprocessingPlugin::::default(), - BinnedRenderPhaseGpuPreprocessingPlugin::::default(), - BinnedRenderPhaseGpuPreprocessingPlugin::::default(), - SortedRenderPhaseGpuPreprocessingPlugin::::default(), - SortedRenderPhaseGpuPreprocessingPlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + 
SortedRenderPhasePlugin::::default(), )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { @@ -907,7 +907,7 @@ impl GetBatchData for MeshPipeline { } } -impl GetBatchInputData for MeshPipeline { +impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; fn get_batch_input_index( @@ -935,14 +935,8 @@ impl GetBatchInputData for MeshPipeline { )), )) } -} - -impl GetBinnedBatchData for MeshPipeline { - type Param = (SRes, SRes); - - type BufferData = MeshUniform; - fn get_batch_data( + fn get_binned_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option { @@ -961,12 +955,8 @@ impl GetBinnedBatchData for MeshPipeline { maybe_lightmap.map(|lightmap| lightmap.uv_rect), )) } -} -impl GetBinnedBatchInputData for MeshPipeline { - type BufferInputData = MeshInputUniform; - - fn get_batch_input_index( + fn get_binned_batch_input_index( (mesh_instances, _): &SystemParamItem, entity: Entity, ) -> Option { diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 4c66f04bfec6b..335cd68c753c8 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -187,6 +187,10 @@ where /// A trait to support getting data used for batching draw commands via phase /// items. +/// +/// This is a simple version that only allows for sorting, not binning, as well +/// as only CPU processing, not GPU preprocessing. For these fancier features, +/// see [`GetFullBatchData`]. pub trait GetBatchData { /// The system parameters [`GetBatchData::get_batch_data`] needs in /// order to compute the batch data. @@ -215,12 +219,23 @@ pub trait GetBatchData { /// A trait to support getting data used for batching draw commands via phase /// items. /// -/// This is only used when GPU preprocessing is in use. -pub trait GetBatchInputData: GetBatchData { +/// This version allows for binning and GPU preprocessing. 
+pub trait GetFullBatchData: GetBatchData { /// The per-instance data that was inserted into the [`BufferVec`] during /// extraction. type BufferInputData: Pod + Sync + Send; + /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. + /// + /// This is only called when building uniforms on CPU. In the GPU instance + /// buffer building path, we use + /// [`GetBinnedBatchData::get_batch_input_index`] + /// instead. + fn get_binned_batch_data( + param: &SystemParamItem, + query_item: Entity, + ) -> Option; + /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU /// preprocessing phase will use. /// @@ -232,45 +247,15 @@ pub trait GetBatchInputData: GetBatchData { param: &SystemParamItem, query_item: Entity, ) -> Option<(u32, Option)>; -} -/// When implemented on a pipeline, this trait allows the batching logic to -/// compute the per-batch data that will be uploaded to the GPU. -/// -/// This includes things like the mesh transforms. -pub trait GetBinnedBatchData { - /// The system parameters [`GetBinnedBatchData::get_batch_data`] needs - /// in order to compute the batch data. - type Param: SystemParam + 'static; - /// The per-instance data to be inserted into the [`GpuArrayBuffer`] - /// containing these data for all instances. - type BufferData: GpuArrayBufferable + Sync + Send + 'static; - - /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. - /// - /// This is only called when building uniforms on CPU. In the GPU instance - /// buffer building path, we use - /// [`GetBinnedBatchDataForGpuPreprocessing::get_batch_input_index`] - /// instead. - fn get_batch_data( - param: &SystemParamItem, - entity: Entity, - ) -> Option; -} - -/// Like [`GetBinnedBatchData`], but for GPU preprocessing. -pub trait GetBinnedBatchInputData: GetBinnedBatchData { - /// The per-instance data that was inserted into the [`BufferVec`] during - /// extraction. 
- type BufferInputData: Pod + Sync + Send; - - /// Returns the index of the [`GetBinnedBatchData::BufferInputData`] that - /// the GPU preprocessing phase will use. + /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU + /// preprocessing phase will use, for the binning path. /// - /// We already inserted the [`GetBinnedBatchData::BufferInputData`] during - /// the extraction phase before we got here, so this function shouldn't need - /// to look up any render data. - fn get_batch_input_index( + /// We already inserted the [`GetBatchData::BufferInputData`] during the + /// extraction phase before we got here, so this function shouldn't need to + /// look up any render data. If CPU instance buffer building is in use, this + /// function will never be called. + fn get_binned_batch_input_index( param: &SystemParamItem, query_item: Entity, ) -> Option; @@ -282,13 +267,13 @@ pub trait GetBinnedBatchInputData: GetBinnedBatchData { /// We have to run this during extraction because, if GPU preprocessing is in /// use, the extraction phase will write to the mesh input uniform buffers /// directly, so the buffers need to be cleared before then. -pub fn clear_batched_instance_buffers( - cpu_batched_instance_buffer: Option>>, +pub fn clear_batched_instance_buffers( + cpu_batched_instance_buffer: Option>>, gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, ) where - GBID: GetBatchInputData, + GFBD: GetFullBatchData, { if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { cpu_batched_instance_buffer.clear(); @@ -310,13 +295,13 @@ pub fn clear_batched_instance_buffers( /// This is a separate system from [`clear_batched_instance_buffers`] because /// [`ViewTarget`]s aren't created until after the extraction phase is /// completed. 
-pub fn delete_old_work_item_buffers( +pub fn delete_old_work_item_buffers( gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, view_targets: Query>, ) where - GBID: GetBatchInputData, + GFBD: GetFullBatchData, { if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { gpu_batched_instance_buffers @@ -328,18 +313,18 @@ pub fn delete_old_work_item_buffers( /// Batch the items in a sorted render phase, when GPU instance buffer building /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. -pub fn batch_and_prepare_sorted_render_phase_no_gpu_preprocessing( - cpu_batched_instance_buffer: Option>>, +pub fn batch_and_prepare_sorted_render_phase_no_gpu_preprocessing( + cpu_batched_instance_buffer: Option>>, mut views: Query<&mut SortedRenderPhase>, - param: StaticSystemParam, + param: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, - F: GetBatchData, + GBD: GetBatchData, { let system_param_item = param.into_inner(); - let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { - let (buffer_data, compare_data) = F::get_batch_data(&system_param_item, item.entity())?; + let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { + let (buffer_data, compare_data) = GBD::get_batch_data(&system_param_item, item.entity())?; let buffer_index = buffer.push(buffer_data); let index = buffer_index.index; @@ -378,24 +363,24 @@ pub fn batch_and_prepare_sorted_render_phase_no_gpu_preprocessing( /// Batch the items in a sorted render phase, when GPU instance buffer building /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. 
-pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( +pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, mut views: Query<(Entity, &mut SortedRenderPhase)>, - param: StaticSystemParam, + param: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, - GBID: GetBatchInputData, + GFBD: GetFullBatchData, { let system_param_item = param.into_inner(); let process_item = |item: &mut I, - data_buffer: &mut UninitBufferVec, + data_buffer: &mut UninitBufferVec, work_item_buffer: &mut BufferVec| { let (input_index, compare_data) = - GBID::get_batch_input_index(&system_param_item, item.entity())?; + GFBD::get_batch_input_index(&system_param_item, item.entity())?; let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { @@ -457,13 +442,13 @@ where /// Creates batches for a render phase that uses bins, when GPU batch data /// building isn't in use. -pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( - cpu_batched_instance_buffer: Option>>, +pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( + cpu_batched_instance_buffer: Option>>, mut views: Query<&mut BinnedRenderPhase>, - param: StaticSystemParam, + param: StaticSystemParam, ) where BPI: BinnedPhaseItem, - GBBD: GetBinnedBatchData, + GFBD: GetFullBatchData, { let system_param_item = param.into_inner(); @@ -480,7 +465,8 @@ pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( for key in &phase.batchable_keys { let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![]; for &entity in &phase.batchable_values[key] { - let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else { + let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) + else { continue; }; let instance = buffer.push(buffer_data); @@ -513,7 +499,8 @@ pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( 
for key in &phase.unbatchable_keys { let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); for &entity in &unbatchables.entities { - let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else { + let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) + else { continue; }; let instance = buffer.push(buffer_data); @@ -524,15 +511,15 @@ pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( } /// Creates batches for a render phase that uses bins. -pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( +pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, mut views: Query<(Entity, &mut BinnedRenderPhase)>, - param: StaticSystemParam, + param: StaticSystemParam, ) where BPI: BinnedPhaseItem, - GBBDGP: GetBinnedBatchInputData, + GFBD: GetFullBatchData, { let system_param_item = param.into_inner(); @@ -560,7 +547,8 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( for key in &phase.batchable_keys { let mut batch: Option = None; for &entity in &phase.batchable_values[key] { - let Some(input_index) = GBBDGP::get_batch_input_index(&system_param_item, entity) + let Some(input_index) = + GFBD::get_binned_batch_input_index(&system_param_item, entity) else { continue; }; @@ -590,7 +578,8 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( for key in &phase.unbatchable_keys { let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); for &entity in &unbatchables.entities { - let Some(input_index) = GBBDGP::get_batch_input_index(&system_param_item, entity) + let Some(input_index) = + GFBD::get_binned_batch_input_index(&system_param_item, entity) else { continue; }; @@ -603,7 +592,7 @@ pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( unbatchables .buffer_indices - .add(GpuArrayBufferIndex:: { + .add(GpuArrayBufferIndex:: { index: output_index, dynamic_offset: 
None, element_type: PhantomData, @@ -625,14 +614,14 @@ pub fn write_cpu_built_batched_instance_buffers( } } -pub fn write_gpu_built_batched_instance_buffers( +pub fn write_gpu_built_batched_instance_buffers( render_device: Res, render_queue: Res, gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, ) where - GBID: GetBatchInputData, + GFBD: GetFullBatchData, { let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { return; diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index eec70d73b0e1b..c22dc9c55380f 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -38,9 +38,7 @@ use nonmax::NonMaxU32; pub use rangefinder::*; use crate::{ - batching::{ - self, GetBatchData, GetBatchInputData, GetBinnedBatchData, GetBinnedBatchInputData, - }, + batching::{self, GetFullBatchData}, render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}, Render, RenderApp, RenderSet, }; @@ -298,71 +296,30 @@ where } } -/// A convenient abstraction for adding all the systems necessary for a binned -/// render phase to the render app. -/// -/// This is the version used when the pipeline doesn't support GPU -/// preprocessing: e.g. 2D meshes. -pub struct BinnedRenderPhasePlugin(PhantomData<(BPI, GBBD)>) -where - BPI: BinnedPhaseItem, - GBBD: GetBinnedBatchData; - /// A convenient abstraction for adding all the systems necessary for a binned /// render phase to the render app. /// /// This is the version used when the pipeline supports GPU preprocessing: e.g. /// 3D PBR meshes. 
-pub struct BinnedRenderPhaseGpuPreprocessingPlugin(PhantomData<(BPI, GBBID)>) -where - BPI: BinnedPhaseItem, - GBBID: GetBinnedBatchInputData; - -impl Default for BinnedRenderPhasePlugin +pub struct BinnedRenderPhasePlugin(PhantomData<(BPI, GFBD)>) where BPI: BinnedPhaseItem, - GBBD: GetBinnedBatchData, -{ - fn default() -> Self { - Self(PhantomData) - } -} + GFBD: GetFullBatchData; -impl Default for BinnedRenderPhaseGpuPreprocessingPlugin +impl Default for BinnedRenderPhasePlugin where BPI: BinnedPhaseItem, - GBBID: GetBinnedBatchInputData, + GFBD: GetFullBatchData, { fn default() -> Self { Self(PhantomData) } } -impl Plugin for BinnedRenderPhasePlugin -where - BPI: BinnedPhaseItem, - GBBD: GetBinnedBatchData + Sync + Send + 'static, -{ - fn build(&self, app: &mut App) { - let Some(render_app) = app.get_sub_app_mut(RenderApp) else { - return; - }; - - render_app.add_systems( - Render, - ( - batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), - batching::batch_and_prepare_binned_render_phase_no_gpu_preprocessing:: - .in_set(RenderSet::PrepareResources), - ), - ); - } -} - -impl Plugin for BinnedRenderPhaseGpuPreprocessingPlugin +impl Plugin for BinnedRenderPhasePlugin where BPI: BinnedPhaseItem, - GBBID: GetBinnedBatchInputData + Sync + Send + 'static, + GFBD: GetFullBatchData + Sync + Send + 'static, { fn build(&self, app: &mut App) { let Some(render_app) = app.get_sub_app_mut(RenderApp) else { @@ -376,11 +333,11 @@ where ( batching::batch_and_prepare_binned_render_phase_no_gpu_preprocessing::< BPI, - GBBID, + GFBD, >, batching::batch_and_prepare_binned_render_phase_for_gpu_preprocessing::< BPI, - GBBID, + GFBD, >, ) .in_set(RenderSet::PrepareResources), @@ -389,68 +346,30 @@ where } } -/// A convenient abstraction for adding all the systems necessary for a sorted -/// render phase to the render app. -/// -/// This is the version used when the pipeline doesn't support GPU -/// preprocessing: e.g. 2D sprites. 
-pub struct SortedRenderPhasePlugin(PhantomData<(SPI, GBD)>) -where - SPI: SortedPhaseItem, - GBD: GetBatchData; - /// A convenient abstraction for adding all the systems necessary for a sorted /// render phase to the render app. /// /// This is the version used when the pipeline supports GPU preprocessing: e.g. /// 3D PBR meshes. -pub struct SortedRenderPhaseGpuPreprocessingPlugin(PhantomData<(SPI, GBID)>) +pub struct SortedRenderPhasePlugin(PhantomData<(SPI, GFBD)>) where SPI: SortedPhaseItem, - GBID: GetBatchInputData; - -impl Default for SortedRenderPhasePlugin -where - SPI: SortedPhaseItem, - GBD: GetBatchData, -{ - fn default() -> Self { - Self(PhantomData) - } -} + GFBD: GetFullBatchData; -impl Default for SortedRenderPhaseGpuPreprocessingPlugin +impl Default for SortedRenderPhasePlugin where SPI: SortedPhaseItem, - GBID: GetBatchInputData, + GFBD: GetFullBatchData, { fn default() -> Self { Self(PhantomData) } } -impl Plugin for SortedRenderPhasePlugin -where - SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem, - GBD: GetBatchData + Sync + Send + 'static, -{ - fn build(&self, app: &mut App) { - let Some(render_app) = app.get_sub_app_mut(RenderApp) else { - return; - }; - - render_app.add_systems( - Render, - batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing:: - .in_set(RenderSet::PrepareResources), - ); - } -} - -impl Plugin for SortedRenderPhaseGpuPreprocessingPlugin +impl Plugin for SortedRenderPhasePlugin where SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem, - GBID: GetBatchInputData + Sync + Send + 'static, + GFBD: GetFullBatchData + Sync + Send + 'static, { fn build(&self, app: &mut App) { let Some(render_app) = app.get_sub_app_mut(RenderApp) else { @@ -460,8 +379,8 @@ where render_app.add_systems( Render, ( - batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::, - batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, + 
batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::, + batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, ) .in_set(RenderSet::PrepareResources), ); From 18a5889fd2ea2d79271055f9aeb7420a143826aa Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 13:14:19 -0700 Subject: [PATCH 22/39] Move the GPU-preprocessing and non-GPU-preprocessing functionality into different modules. This allows us to shorten the names. --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 16 +- crates/bevy_pbr/src/render/mesh.rs | 32 +- .../src/batching/gpu_preprocessing.rs | 326 ++++++++++++ crates/bevy_render/src/batching/mod.rs | 489 +----------------- .../src/batching/no_gpu_preprocessing.rs | 177 +++++++ crates/bevy_render/src/render_phase/mod.rs | 16 +- crates/bevy_sprite/src/mesh2d/mesh.rs | 20 +- 7 files changed, 549 insertions(+), 527 deletions(-) create mode 100644 crates/bevy_render/src/batching/gpu_preprocessing.rs create mode 100644 crates/bevy_render/src/batching/no_gpu_preprocessing.rs diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index beadf8793b0a2..a8912e4d37aed 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -20,7 +20,7 @@ use bevy_ecs::{ world::{FromWorld, World}, }; use bevy_render::{ - batching::{BatchedGpuBuiltInstanceBuffers, PreprocessWorkItem}, + batching::gpu_preprocessing::{BatchedInstanceBuffers, PreprocessWorkItem}, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, @@ -128,10 +128,10 @@ impl Node for GpuPreprocessNode { ) -> Result<(), NodeRunError> { // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. - let Some(BatchedGpuBuiltInstanceBuffers { + let Some(BatchedInstanceBuffers { work_item_buffers: ref index_buffers, .. 
- }) = world.get_resource::>() + }) = world.get_resource::>() else { error!( "Attempted to preprocess meshes on GPU, but `GpuBuilt` batched instance buffers \ @@ -244,22 +244,20 @@ pub fn prepare_preprocess_pipeline( pub fn prepare_preprocess_bind_groups( mut commands: Commands, render_device: Res, - gpu_batched_instance_buffers: Option< - Res>, - >, + batched_instance_buffers: Option>>, pipeline: Res, ) { // Grab the [`BatchedGpuBuiltInstanceBuffers`]. If we aren't using GPU mesh // uniform building, bail out. - let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + let Some(batched_instance_buffers) = batched_instance_buffers else { return; }; - let BatchedGpuBuiltInstanceBuffers { + let BatchedInstanceBuffers { data_buffer: ref data_buffer_vec, work_item_buffers: ref index_buffers, current_input_buffer: ref current_input_buffer_vec, previous_input_buffer: ref previous_input_buffer_vec, - } = gpu_batched_instance_buffers.into_inner(); + } = batched_instance_buffers.into_inner(); let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( current_input_buffer_vec.buffer(), diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 407b52d1eb157..f3a9d6ecf2abe 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -15,9 +15,7 @@ use bevy_ecs::{ use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ - clear_batched_instance_buffers, delete_old_work_item_buffers, - write_cpu_built_batched_instance_buffers, write_gpu_built_batched_instance_buffers, - BatchedCpuBuiltInstanceBuffer, BatchedGpuBuiltInstanceBuffers, GetBatchData, + clear_batched_instance_buffers, gpu_preprocessing, no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching, }, mesh::*, @@ -158,12 +156,12 @@ impl Plugin for MeshRenderPlugin { .add_systems( Render, ( - delete_old_work_item_buffers:: + gpu_preprocessing::delete_old_work_item_buffers:: 
.in_set(RenderSet::ManageViews) .after(prepare_view_targets), - write_cpu_built_batched_instance_buffers:: + no_gpu_preprocessing::write_batched_instance_buffer:: .in_set(RenderSet::PrepareResourcesFlush), - write_gpu_built_batched_instance_buffers:: + gpu_preprocessing::write_batched_instance_buffers:: .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), prepare_morphs.in_set(RenderSet::PrepareResources), @@ -192,12 +190,12 @@ impl Plugin for MeshRenderPlugin { if let Some(render_app) = app.get_sub_app_mut(RenderApp) { if self.use_gpu_instance_buffer_builder { render_app - .init_resource::>( + .init_resource::>( ); } else { let render_device = render_app.world().resource::(); let cpu_batched_instance_buffer = - BatchedCpuBuiltInstanceBuffer::::new(render_device); + no_gpu_preprocessing::BatchedInstanceBuffer::::new(render_device); render_app.insert_resource(cpu_batched_instance_buffer); }; @@ -615,8 +613,8 @@ pub fn extract_meshes_for_cpu_building( /// [`MeshUniform`] building. pub fn extract_meshes_for_gpu_building( mut render_mesh_instances: ResMut, - mut gpu_batched_instance_buffers: ResMut< - BatchedGpuBuiltInstanceBuffers, + mut batched_instance_buffers: ResMut< + gpu_preprocessing::BatchedInstanceBuffers, >, mut render_mesh_instance_queues: Local>>, mut prev_render_mesh_instances: Local>, @@ -681,7 +679,7 @@ pub fn extract_meshes_for_gpu_building( collect_meshes_for_gpu_building( &mut render_mesh_instances, - &mut gpu_batched_instance_buffers, + &mut batched_instance_buffers, &mut render_mesh_instance_queues, &mut prev_render_mesh_instances, ); @@ -691,7 +689,7 @@ pub fn extract_meshes_for_gpu_building( /// uniforms are built. 
fn collect_meshes_for_gpu_building( render_mesh_instances: &mut RenderMeshInstances, - gpu_batched_instance_buffers: &mut BatchedGpuBuiltInstanceBuffers< + batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers< MeshUniform, MeshInputUniform, >, @@ -707,11 +705,11 @@ fn collect_meshes_for_gpu_building( ); }; - let BatchedGpuBuiltInstanceBuffers { + let gpu_preprocessing::BatchedInstanceBuffers { ref mut current_input_buffer, ref mut previous_input_buffer, .. - } = gpu_batched_instance_buffers; + } = batched_instance_buffers; // Swap buffers. mem::swap(current_input_buffer, previous_input_buffer); @@ -1503,9 +1501,11 @@ pub fn prepare_mesh_bind_group( mut groups: ResMut, mesh_pipeline: Res, render_device: Res, - cpu_batched_instance_buffer: Option>>, + cpu_batched_instance_buffer: Option< + Res>, + >, gpu_batched_instance_buffers: Option< - Res>, + Res>, >, skins_uniform: Res, weights_uniform: Res, diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs new file mode 100644 index 0000000000000..2e94b46a0b0cb --- /dev/null +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -0,0 +1,326 @@ +//! Batching functionality when GPU preprocessing is in use. 
+ +use std::marker::PhantomData; + +use bevy_ecs::{ + entity::Entity, + query::With, + system::{Query, Res, ResMut, Resource, StaticSystemParam}, +}; +use bevy_encase_derive::ShaderType; +use bevy_utils::EntityHashMap; +use bytemuck::{Pod, Zeroable}; +use smallvec::smallvec; +use wgpu::{BindingResource, BufferUsages}; + +use crate::{ + render_phase::{ + BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, + SortedPhaseItem, SortedRenderPhase, + }, + render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec}, + renderer::{RenderDevice, RenderQueue}, + view::ViewTarget, +}; + +use super::{BatchMeta, GetFullBatchData}; + +/// The GPU buffers holding the data needed to render batches. +/// +/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the +/// `BD` type parameter in that mode. +/// +/// We have a separate *buffer data input* type (`BDI`) here, which a compute +/// shader is expected to expand to the full buffer data (`BD`) type. GPU +/// uniform building is generally faster and uses less GPU bus bandwidth, but +/// only implemented for some pipelines (for example, not in the 2D pipeline at +/// present) and only when compute shader is available. +#[derive(Resource)] +pub struct BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// A storage area for the buffer data that the GPU compute shader is + /// expected to write to. + /// + /// There will be one entry for each index. + pub data_buffer: UninitBufferVec, + + /// The index of the buffer data in the current input buffer that + /// corresponds to each instance. + /// + /// This is keyed off each view. Each view has a separate buffer. + pub work_item_buffers: EntityHashMap>, + + /// The uniform data inputs for the current frame. + /// + /// These are uploaded during the extraction phase. 
+ pub current_input_buffer: BufferVec, + + /// The uniform data inputs for the previous frame. + /// + /// The indices don't generally line up between `current_input_buffer` + /// and `previous_input_buffer`, because, among other reasons, entities + /// can spawn or despawn between frames. Instead, each current buffer + /// data input uniform is expected to contain the index of the + /// corresponding buffer data input uniform in this list. + pub previous_input_buffer: BufferVec, +} + +/// One invocation of the preprocessing shader: i.e. one mesh instance in a +/// view. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct PreprocessWorkItem { + /// The index of the batch input data in the input buffer that the shader + /// reads from. + pub input_index: u32, + /// The index of the `MeshUniform` in the output buffer that we write to. + pub output_index: u32, +} + +impl BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// Creates new buffers. + pub fn new() -> Self { + BatchedInstanceBuffers { + data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), + work_item_buffers: EntityHashMap::default(), + current_input_buffer: BufferVec::new(BufferUsages::STORAGE), + previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), + } + } + + /// Returns the binding of the buffer that contains the per-instance data. + /// + /// This buffer needs to be filled in via a compute shader. + pub fn instance_data_binding(&self) -> Option { + self.data_buffer + .buffer() + .map(|buffer| buffer.as_entire_binding()) + } +} + +impl Default for BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + fn default() -> Self { + Self::new() + } +} + +/// A system that removes GPU preprocessing work item buffers that correspond to +/// deleted [`ViewTarget`]s. 
+///
+/// This is a separate system from [`clear_batched_instance_buffers`] because
+/// [`ViewTarget`]s aren't created until after the extraction phase is
+/// completed.
+pub fn delete_old_work_item_buffers(
+    gpu_batched_instance_buffers: Option<
+        ResMut>,
+    >,
+    view_targets: Query>,
+) where
+    GFBD: GetFullBatchData,
+{
+    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        gpu_batched_instance_buffers
+            .work_item_buffers
+            .retain(|entity, _| view_targets.contains(*entity));
+    }
+}
+
+/// Batch the items in a sorted render phase, when GPU instance buffer building
+/// is in use. This means comparing metadata needed to draw each phase item
+/// and trying to combine the draws into a batch.
+pub fn batch_and_prepare_sorted_render_phase(
+    gpu_batched_instance_buffers: Option<
+        ResMut>,
+    >,
+    mut views: Query<(Entity, &mut SortedRenderPhase)>,
+    param: StaticSystemParam,
+) where
+    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    let system_param_item = param.into_inner();
+
+    let process_item =
+        |item: &mut I,
+         data_buffer: &mut UninitBufferVec,
+         work_item_buffer: &mut BufferVec| {
+            let (input_index, compare_data) =
+                GFBD::get_batch_input_index(&system_param_item, item.entity())?;
+            let output_index = data_buffer.add() as u32;
+
+            work_item_buffer.push(PreprocessWorkItem {
+                input_index,
+                output_index,
+            });
+
+            *item.batch_range_mut() = output_index..output_index + 1;
+
+            if I::AUTOMATIC_BATCHING {
+                compare_data.map(|compare_data| BatchMeta::new(item, compare_data))
+            } else {
+                None
+            }
+        };
+
+    // We only process GPU-built batch data in this function.
+    let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else {
+        return;
+    };
+    let BatchedInstanceBuffers {
+        ref mut data_buffer,
+        ref mut work_item_buffers,
+        ..
+ } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + let items = phase.items.iter_mut().map(|item| { + let batch_data = process_item(item, data_buffer, work_item_buffer); + (item.batch_range_mut(), batch_data) + }); + items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { + if batch_meta.is_some() && prev_batch_meta == batch_meta { + start_range.end = range.end; + (start_range, prev_batch_meta) + } else { + (range, batch_meta) + } + }); + } +} + +/// Creates batches for a render phase that uses bins. +pub fn batch_and_prepare_binned_render_phase( + gpu_batched_instance_buffers: Option< + ResMut>, + >, + mut views: Query<(Entity, &mut BinnedRenderPhase)>, + param: StaticSystemParam, +) where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + // We only process GPU-built batch data in this function. + let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + let BatchedInstanceBuffers { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + let phase = &mut *phase; // Borrow checker. + + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + // Prepare batchables. 
+ + for key in &phase.batchable_keys { + let mut batch: Option = None; + for &entity in &phase.batchable_values[key] { + let Some(input_index) = + GFBD::get_binned_batch_input_index(&system_param_item, entity) + else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + batch + .get_or_insert(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index, + dynamic_offset: None, + }) + .instance_range + .end = output_index + 1; + } + + if let Some(batch) = batch { + phase.batch_sets.push(smallvec![batch]); + } + } + + // Prepare unbatchables. + for key in &phase.unbatchable_keys { + let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); + for &entity in &unbatchables.entities { + let Some(input_index) = + GFBD::get_binned_batch_input_index(&system_param_item, entity) + else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + unbatchables + .buffer_indices + .add(GpuArrayBufferIndex:: { + index: output_index, + dynamic_offset: None, + element_type: PhantomData, + }); + } + } + } +} + +/// A system that writes all instance buffers to the GPU. +pub fn write_batched_instance_buffers( + render_device: Res, + render_queue: Res, + gpu_batched_instance_buffers: Option< + ResMut>, + >, +) where + GFBD: GetFullBatchData, +{ + let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { + return; + }; + + gpu_batched_instance_buffers + .data_buffer + .write_buffer(&render_device); + gpu_batched_instance_buffers + .current_input_buffer + .write_buffer(&render_device, &render_queue); + // There's no need to write `previous_input_buffer`, as we wrote + // that on the previous frame, and it hasn't changed. 
+ + for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { + work_item_buffer.write_buffer(&render_device, &render_queue); + } +} diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 335cd68c753c8..a06410ed7e9b9 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -1,33 +1,21 @@ -use std::marker::PhantomData; - -use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ component::Component, entity::Entity, - prelude::Res, - query::With, - system::{Query, ResMut, Resource, StaticSystemParam, SystemParam, SystemParamItem}, + system::{Query, ResMut, SystemParam, SystemParamItem}, }; -use bevy_utils::EntityHashMap; -use bytemuck::{Pod, Zeroable}; -use encase::ShaderType; +use bytemuck::Pod; use nonmax::NonMaxU32; -use smallvec::{smallvec, SmallVec}; -use wgpu::{BindingResource, BufferUsages}; use crate::{ render_phase::{ - BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem, - DrawFunctionId, SortedPhaseItem, SortedRenderPhase, - }, - render_resource::{ - BufferVec, CachedRenderPipelineId, GpuArrayBuffer, GpuArrayBufferIndex, GpuArrayBufferable, - UninitBufferVec, + BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem, DrawFunctionId, }, - renderer::{RenderDevice, RenderQueue}, - view::ViewTarget, + render_resource::{CachedRenderPipelineId, GpuArrayBufferable}, }; +pub mod gpu_preprocessing; +pub mod no_gpu_preprocessing; + /// Add this component to mesh entities to disable automatic batching #[derive(Component)] pub struct NoAutomaticBatching; @@ -69,122 +57,6 @@ impl BatchMeta { } } -/// The GPU buffers holding the data needed to render batches. -/// -/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the -/// `BD` type parameter in that mode. -/// -/// There are two setups here, one for CPU uniform building and one for GPU -/// uniform building. 
The CPU uniform setup is simple: there's one *buffer data* -/// (`BD`) type per instance. GPU uniform building has a separate *buffer data -/// input* type (`BDI`), which a compute shader is expected to expand to the -/// full buffer data (`BD`) type. GPU uniform building is generally faster and -/// uses less GPU bus bandwidth, but only implemented for some pipelines (for -/// example, not in the 2D pipeline at present) and only when compute shader is -/// available. -#[derive(Resource, Deref, DerefMut)] -pub struct BatchedCpuBuiltInstanceBuffer(pub GpuArrayBuffer) -where - BD: GpuArrayBufferable + Sync + Send + 'static; - -#[derive(Resource)] -pub struct BatchedGpuBuiltInstanceBuffers -where - BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod, -{ - /// A storage area for the buffer data that the GPU compute shader is - /// expected to write to. - /// - /// There will be one entry for each index. - pub data_buffer: UninitBufferVec, - - /// The index of the buffer data in the current input buffer that - /// corresponds to each instance. - /// - /// This is keyed off each view. Each view has a separate buffer. - pub work_item_buffers: EntityHashMap>, - - /// The uniform data inputs for the current frame. - /// - /// These are uploaded during the extraction phase. - pub current_input_buffer: BufferVec, - - /// The uniform data inputs for the previous frame. - /// - /// The indices don't generally line up between `current_input_buffer` - /// and `previous_input_buffer`, because, among other reasons, entities - /// can spawn or despawn between frames. Instead, each current buffer - /// data input uniform is expected to contain the index of the - /// corresponding buffer data input uniform in this list. - pub previous_input_buffer: BufferVec, -} - -/// One invocation of the preprocessing shader: i.e. one mesh instance in a -/// view. 
-#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] -#[repr(C)] -pub struct PreprocessWorkItem { - /// The index of the batch input data in the input buffer that the shader - /// reads from. - pub input_index: u32, - /// The index of the `MeshUniform` in the output buffer that we write to. - pub output_index: u32, -} - -impl BatchedCpuBuiltInstanceBuffer -where - BD: GpuArrayBufferable + Sync + Send + 'static, -{ - /// Creates new buffers. - pub fn new(render_device: &RenderDevice) -> Self { - BatchedCpuBuiltInstanceBuffer(GpuArrayBuffer::new(render_device)) - } - - /// Returns the binding of the buffer that contains the per-instance data. - /// - /// If we're in the GPU instance buffer building mode, this buffer needs to - /// be filled in via a compute shader. - pub fn instance_data_binding(&self) -> Option { - self.binding() - } -} - -impl BatchedGpuBuiltInstanceBuffers -where - BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod, -{ - /// Creates new buffers. - pub fn new() -> Self { - BatchedGpuBuiltInstanceBuffers { - data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), - work_item_buffers: EntityHashMap::default(), - current_input_buffer: BufferVec::new(BufferUsages::STORAGE), - previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), - } - } - - /// Returns the binding of the buffer that contains the per-instance data. - /// - /// This buffer needs to be filled in via a compute shader. - pub fn instance_data_binding(&self) -> Option { - self.data_buffer - .buffer() - .map(|buffer| buffer.as_entire_binding()) - } -} - -impl Default for BatchedGpuBuiltInstanceBuffers -where - BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod, -{ - fn default() -> Self { - Self::new() - } -} - /// A trait to support getting data used for batching draw commands via phase /// items. 
/// @@ -268,9 +140,11 @@ pub trait GetFullBatchData: GetBatchData { /// use, the extraction phase will write to the mesh input uniform buffers /// directly, so the buffers need to be cleared before then. pub fn clear_batched_instance_buffers( - cpu_batched_instance_buffer: Option>>, + cpu_batched_instance_buffer: Option< + ResMut>, + >, gpu_batched_instance_buffers: Option< - ResMut>, + ResMut>, >, ) where GFBD: GetFullBatchData, @@ -289,146 +163,6 @@ pub fn clear_batched_instance_buffers( } } -/// A system that removes GPU preprocessing work item buffers that correspond to -/// deleted [`ViewTarget`]s. -/// -/// This is a separate system from [`clear_batched_instance_buffers`] because -/// [`ViewTarget`]s aren't created until after the extraction phase is -/// completed. -pub fn delete_old_work_item_buffers( - gpu_batched_instance_buffers: Option< - ResMut>, - >, - view_targets: Query>, -) where - GFBD: GetFullBatchData, -{ - if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { - gpu_batched_instance_buffers - .work_item_buffers - .retain(|entity, _| view_targets.contains(*entity)); - } -} - -/// Batch the items in a sorted render phase, when GPU instance buffer building -/// isn't in use. This means comparing metadata needed to draw each phase item -/// and trying to combine the draws into a batch. 
-pub fn batch_and_prepare_sorted_render_phase_no_gpu_preprocessing( - cpu_batched_instance_buffer: Option>>, - mut views: Query<&mut SortedRenderPhase>, - param: StaticSystemParam, -) where - I: CachedRenderPipelinePhaseItem + SortedPhaseItem, - GBD: GetBatchData, -{ - let system_param_item = param.into_inner(); - - let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { - let (buffer_data, compare_data) = GBD::get_batch_data(&system_param_item, item.entity())?; - let buffer_index = buffer.push(buffer_data); - - let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; - - if I::AUTOMATIC_BATCHING { - compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) - } else { - None - } - }; - - // We only process CPU-built batch data in this function. - let Some(cpu_batched_instance_buffers) = cpu_batched_instance_buffer else { - return; - }; - let cpu_batched_instance_buffers = cpu_batched_instance_buffers.into_inner(); - - for mut phase in &mut views { - let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, cpu_batched_instance_buffers); - (item.batch_range_mut(), batch_data) - }); - items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { - if batch_meta.is_some() && prev_batch_meta == batch_meta { - start_range.end = range.end; - (start_range, prev_batch_meta) - } else { - (range, batch_meta) - } - }); - } -} - -/// Batch the items in a sorted render phase, when GPU instance buffer building -/// isn't in use. This means comparing metadata needed to draw each phase item -/// and trying to combine the draws into a batch. 
-pub fn batch_and_prepare_sorted_render_phase_for_gpu_preprocessing( - gpu_batched_instance_buffers: Option< - ResMut>, - >, - mut views: Query<(Entity, &mut SortedRenderPhase)>, - param: StaticSystemParam, -) where - I: CachedRenderPipelinePhaseItem + SortedPhaseItem, - GFBD: GetFullBatchData, -{ - let system_param_item = param.into_inner(); - - let process_item = - |item: &mut I, - data_buffer: &mut UninitBufferVec, - work_item_buffer: &mut BufferVec| { - let (input_index, compare_data) = - GFBD::get_batch_input_index(&system_param_item, item.entity())?; - let output_index = data_buffer.add() as u32; - - work_item_buffer.push(PreprocessWorkItem { - input_index, - output_index, - }); - - *item.batch_range_mut() = output_index..output_index + 1; - - if I::AUTOMATIC_BATCHING { - compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) - } else { - None - } - }; - - // We only process GPU-built batch data in this function. - let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; - let BatchedGpuBuiltInstanceBuffers { - ref mut data_buffer, - ref mut work_item_buffers, - .. - } = gpu_batched_instance_buffers.into_inner(); - - for (view, mut phase) in &mut views { - // Create the work item buffer if necessary; otherwise, just mark it as - // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); - - let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, data_buffer, work_item_buffer); - (item.batch_range_mut(), batch_data) - }); - items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { - if batch_meta.is_some() && prev_batch_meta == batch_meta { - start_range.end = range.end; - (start_range, prev_batch_meta) - } else { - (range, batch_meta) - } - }); - } -} - /// Sorts a render phase that uses bins. 
pub fn sort_binned_render_phase(mut views: Query<&mut BinnedRenderPhase>) where @@ -439,204 +173,3 @@ where phase.unbatchable_keys.sort_unstable(); } } - -/// Creates batches for a render phase that uses bins, when GPU batch data -/// building isn't in use. -pub fn batch_and_prepare_binned_render_phase_no_gpu_preprocessing( - cpu_batched_instance_buffer: Option>>, - mut views: Query<&mut BinnedRenderPhase>, - param: StaticSystemParam, -) where - BPI: BinnedPhaseItem, - GFBD: GetFullBatchData, -{ - let system_param_item = param.into_inner(); - - // We only process CPU-built batch data in this function. - let Some(mut buffer) = cpu_batched_instance_buffer else { - return; - }; - - for mut phase in &mut views { - let phase = &mut *phase; // Borrow checker. - - // Prepare batchables. - - for key in &phase.batchable_keys { - let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![]; - for &entity in &phase.batchable_values[key] { - let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) - else { - continue; - }; - let instance = buffer.push(buffer_data); - - // If the dynamic offset has changed, flush the batch. - // - // This is the only time we ever have more than one batch per - // bin. Note that dynamic offsets are only used on platforms - // with no storage buffers. - if !batch_set.last().is_some_and(|batch| { - batch.instance_range.end == instance.index - && batch.dynamic_offset == instance.dynamic_offset - }) { - batch_set.push(BinnedRenderPhaseBatch { - representative_entity: entity, - instance_range: instance.index..instance.index, - dynamic_offset: instance.dynamic_offset, - }); - } - - if let Some(batch) = batch_set.last_mut() { - batch.instance_range.end = instance.index + 1; - } - } - - phase.batch_sets.push(batch_set); - } - - // Prepare unbatchables. 
- for key in &phase.unbatchable_keys { - let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); - for &entity in &unbatchables.entities { - let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) - else { - continue; - }; - let instance = buffer.push(buffer_data); - unbatchables.buffer_indices.add(instance); - } - } - } -} - -/// Creates batches for a render phase that uses bins. -pub fn batch_and_prepare_binned_render_phase_for_gpu_preprocessing( - gpu_batched_instance_buffers: Option< - ResMut>, - >, - mut views: Query<(Entity, &mut BinnedRenderPhase)>, - param: StaticSystemParam, -) where - BPI: BinnedPhaseItem, - GFBD: GetFullBatchData, -{ - let system_param_item = param.into_inner(); - - // We only process GPU-built batch data in this function. - let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; - let BatchedGpuBuiltInstanceBuffers { - ref mut data_buffer, - ref mut work_item_buffers, - .. - } = gpu_batched_instance_buffers.into_inner(); - - for (view, mut phase) in &mut views { - let phase = &mut *phase; // Borrow checker. - - // Create the work item buffer if necessary; otherwise, just mark it as - // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); - - // Prepare batchables. 
- - for key in &phase.batchable_keys { - let mut batch: Option = None; - for &entity in &phase.batchable_values[key] { - let Some(input_index) = - GFBD::get_binned_batch_input_index(&system_param_item, entity) - else { - continue; - }; - let output_index = data_buffer.add() as u32; - - work_item_buffer.push(PreprocessWorkItem { - input_index, - output_index, - }); - - batch - .get_or_insert(BinnedRenderPhaseBatch { - representative_entity: entity, - instance_range: output_index..output_index, - dynamic_offset: None, - }) - .instance_range - .end = output_index + 1; - } - - if let Some(batch) = batch { - phase.batch_sets.push(smallvec![batch]); - } - } - - // Prepare unbatchables. - for key in &phase.unbatchable_keys { - let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); - for &entity in &unbatchables.entities { - let Some(input_index) = - GFBD::get_binned_batch_input_index(&system_param_item, entity) - else { - continue; - }; - let output_index = data_buffer.add() as u32; - - work_item_buffer.push(PreprocessWorkItem { - input_index, - output_index, - }); - - unbatchables - .buffer_indices - .add(GpuArrayBufferIndex:: { - index: output_index, - dynamic_offset: None, - element_type: PhantomData, - }); - } - } - } -} - -pub fn write_cpu_built_batched_instance_buffers( - render_device: Res, - render_queue: Res, - cpu_batched_instance_buffer: Option>>, -) where - GBD: GetBatchData, -{ - if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { - cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue); - } -} - -pub fn write_gpu_built_batched_instance_buffers( - render_device: Res, - render_queue: Res, - gpu_batched_instance_buffers: Option< - ResMut>, - >, -) where - GFBD: GetFullBatchData, -{ - let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; - - gpu_batched_instance_buffers - .data_buffer - .write_buffer(&render_device); - gpu_batched_instance_buffers - 
.current_input_buffer
-        .write_buffer(&render_device, &render_queue);
-    // There's no need to write `previous_input_buffer`, as we wrote
-    // that on the previous frame, and it hasn't changed.
-
-    for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() {
-        work_item_buffer.write_buffer(&render_device, &render_queue);
-    }
-}
diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
new file mode 100644
index 0000000000000..9b837757327ed
--- /dev/null
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@@ -0,0 +1,177 @@
+//! Batching functionality when GPU preprocessing isn't in use.
+
+use bevy_derive::{Deref, DerefMut};
+use bevy_ecs::system::{Query, Res, ResMut, Resource, StaticSystemParam};
+use smallvec::{smallvec, SmallVec};
+use wgpu::BindingResource;
+
+use crate::{
+    render_phase::{
+        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
+        SortedPhaseItem, SortedRenderPhase,
+    },
+    render_resource::{GpuArrayBuffer, GpuArrayBufferable},
+    renderer::{RenderDevice, RenderQueue},
+};
+
+use super::{BatchMeta, GetBatchData, GetFullBatchData};
+
+/// The GPU buffers holding the data needed to render batches.
+/// 
+/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
+/// `BD` type parameter in that mode.
+#[derive(Resource, Deref, DerefMut)]
+pub struct BatchedInstanceBuffer(pub GpuArrayBuffer)
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static;
+
+impl BatchedInstanceBuffer
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    /// Creates a new buffer.
+    pub fn new(render_device: &RenderDevice) -> Self {
+        BatchedInstanceBuffer(GpuArrayBuffer::new(render_device))
+    }
+
+    /// Returns the binding of the buffer that contains the per-instance data.
+    ///
+    /// Unlike the GPU preprocessing path, this buffer is filled in directly on
+    /// the CPU, so no compute shader pass is needed.
+ pub fn instance_data_binding(&self) -> Option { + self.binding() + } +} + +/// Batch the items in a sorted render phase, when GPU instance buffer building +/// isn't in use. This means comparing metadata needed to draw each phase item +/// and trying to combine the draws into a batch. +pub fn batch_and_prepare_sorted_render_phase( + cpu_batched_instance_buffer: Option>>, + mut views: Query<&mut SortedRenderPhase>, + param: StaticSystemParam, +) where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + GBD: GetBatchData, +{ + let system_param_item = param.into_inner(); + + let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { + let (buffer_data, compare_data) = GBD::get_batch_data(&system_param_item, item.entity())?; + let buffer_index = buffer.push(buffer_data); + + let index = buffer_index.index; + *item.batch_range_mut() = index..index + 1; + *item.dynamic_offset_mut() = buffer_index.dynamic_offset; + + if I::AUTOMATIC_BATCHING { + compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) + } else { + None + } + }; + + // We only process CPU-built batch data in this function. + let Some(cpu_batched_instance_buffers) = cpu_batched_instance_buffer else { + return; + }; + let cpu_batched_instance_buffers = cpu_batched_instance_buffers.into_inner(); + + for mut phase in &mut views { + let items = phase.items.iter_mut().map(|item| { + let batch_data = process_item(item, cpu_batched_instance_buffers); + (item.batch_range_mut(), batch_data) + }); + items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { + if batch_meta.is_some() && prev_batch_meta == batch_meta { + start_range.end = range.end; + (start_range, prev_batch_meta) + } else { + (range, batch_meta) + } + }); + } +} + +/// Creates batches for a render phase that uses bins, when GPU batch data +/// building isn't in use. 
+pub fn batch_and_prepare_binned_render_phase( + cpu_batched_instance_buffer: Option>>, + mut views: Query<&mut BinnedRenderPhase>, + param: StaticSystemParam, +) where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + // We only process CPU-built batch data in this function. + let Some(mut buffer) = cpu_batched_instance_buffer else { + return; + }; + + for mut phase in &mut views { + let phase = &mut *phase; // Borrow checker. + + // Prepare batchables. + + for key in &phase.batchable_keys { + let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![]; + for &entity in &phase.batchable_values[key] { + let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) + else { + continue; + }; + let instance = buffer.push(buffer_data); + + // If the dynamic offset has changed, flush the batch. + // + // This is the only time we ever have more than one batch per + // bin. Note that dynamic offsets are only used on platforms + // with no storage buffers. + if !batch_set.last().is_some_and(|batch| { + batch.instance_range.end == instance.index + && batch.dynamic_offset == instance.dynamic_offset + }) { + batch_set.push(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: instance.index..instance.index, + dynamic_offset: instance.dynamic_offset, + }); + } + + if let Some(batch) = batch_set.last_mut() { + batch.instance_range.end = instance.index + 1; + } + } + + phase.batch_sets.push(batch_set); + } + + // Prepare unbatchables. + for key in &phase.unbatchable_keys { + let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); + for &entity in &unbatchables.entities { + let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity) + else { + continue; + }; + let instance = buffer.push(buffer_data); + unbatchables.buffer_indices.add(instance); + } + } + } +} + +/// Writes the instance buffer data to the GPU. 
+pub fn write_batched_instance_buffer( + render_device: Res, + render_queue: Res, + cpu_batched_instance_buffer: Option>>, +) where + GBD: GetBatchData, +{ + if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { + cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue); + } +} diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index c22dc9c55380f..b584894ba7db2 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -38,7 +38,7 @@ use nonmax::NonMaxU32; pub use rangefinder::*; use crate::{ - batching::{self, GetFullBatchData}, + batching::{self, gpu_preprocessing, no_gpu_preprocessing, GetFullBatchData}, render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}, Render, RenderApp, RenderSet, }; @@ -331,14 +331,8 @@ where ( batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), ( - batching::batch_and_prepare_binned_render_phase_no_gpu_preprocessing::< - BPI, - GFBD, - >, - batching::batch_and_prepare_binned_render_phase_for_gpu_preprocessing::< - BPI, - GFBD, - >, + no_gpu_preprocessing::batch_and_prepare_binned_render_phase::, + gpu_preprocessing::batch_and_prepare_binned_render_phase::, ) .in_set(RenderSet::PrepareResources), ), @@ -379,8 +373,8 @@ where render_app.add_systems( Render, ( - batching::batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::, - batching::batch_and_prepare_sorted_render_phase_for_gpu_preprocessing::, + no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::, + gpu_preprocessing::batch_and_prepare_sorted_render_phase::, ) .in_set(RenderSet::PrepareResources), ); diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index b2aee9a30ad2b..cd7765cee16b5 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -11,15 +11,12 @@ use bevy_ecs::{ }; use bevy_math::{Affine3, Vec4}; use 
bevy_reflect::{std_traits::ReflectDefault, Reflect}; -use bevy_render::batching::{ - write_cpu_built_batched_instance_buffers, BatchedCpuBuiltInstanceBuffer, +use bevy_render::batching::no_gpu_preprocessing::{ + batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer, }; use bevy_render::mesh::MeshVertexBufferLayoutRef; use bevy_render::{ - batching::{ - batch_and_prepare_sorted_render_phase_no_gpu_preprocessing, GetBatchData, - NoAutomaticBatching, - }, + batching::{GetBatchData, NoAutomaticBatching}, globals::{GlobalsBuffer, GlobalsUniform}, mesh::{GpuBufferInfo, Mesh}, render_asset::RenderAssets, @@ -104,12 +101,9 @@ impl Plugin for Mesh2dRenderPlugin { .add_systems( Render, ( - batch_and_prepare_sorted_render_phase_no_gpu_preprocessing::< - Transparent2d, - Mesh2dPipeline, - > + batch_and_prepare_sorted_render_phase:: .in_set(RenderSet::PrepareResources), - write_cpu_built_batched_instance_buffers:: + write_batched_instance_buffer:: .in_set(RenderSet::PrepareResourcesFlush), prepare_mesh2d_bind_group.in_set(RenderSet::PrepareBindGroups), prepare_mesh2d_view_bind_groups.in_set(RenderSet::PrepareBindGroups), @@ -124,7 +118,7 @@ impl Plugin for Mesh2dRenderPlugin { if let Some(render_app) = app.get_sub_app_mut(RenderApp) { let render_device = render_app.world().resource::(); let batched_instance_buffer = - BatchedCpuBuiltInstanceBuffer::::new(render_device); + BatchedInstanceBuffer::::new(render_device); if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size(render_device) @@ -580,7 +574,7 @@ pub fn prepare_mesh2d_bind_group( mut commands: Commands, mesh2d_pipeline: Res, render_device: Res, - mesh2d_uniforms: Res>, + mesh2d_uniforms: Res>, ) { if let Some(binding) = mesh2d_uniforms.instance_data_binding() { commands.insert_resource(Mesh2dBindGroup { From 138ea9446c416bb5ae496feb6d2a43ffc1491d56 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 13:57:16 -0700 Subject: [PATCH 23/39] Address 
review comments --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 16 +++---- crates/bevy_pbr/src/render/mesh.rs | 48 +++++++++++-------- .../bevy_pbr/src/render/mesh_preprocess.wgsl | 2 +- .../src/batching/gpu_preprocessing.rs | 35 +++++--------- .../src/batching/no_gpu_preprocessing.rs | 22 +++------ crates/bevy_render/src/render_phase/mod.rs | 25 ++++++++-- 6 files changed, 75 insertions(+), 73 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index a8912e4d37aed..7661997f76b92 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -15,7 +15,7 @@ use bevy_ecs::{ component::Component, entity::Entity, query::QueryState, - schedule::IntoSystemConfigs as _, + schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, world::{FromWorld, World}, }; @@ -88,7 +88,11 @@ impl Plugin for GpuMeshPreprocessPlugin { Render, ( prepare_preprocess_pipeline.in_set(RenderSet::Prepare), - prepare_preprocess_bind_groups.in_set(RenderSet::PrepareBindGroups), + prepare_preprocess_bind_groups + .run_if( + resource_exists::>, + ) + .in_set(RenderSet::PrepareBindGroups), ), ); } @@ -244,14 +248,10 @@ pub fn prepare_preprocess_pipeline( pub fn prepare_preprocess_bind_groups( mut commands: Commands, render_device: Res, - batched_instance_buffers: Option>>, + batched_instance_buffers: Res>, pipeline: Res, ) { - // Grab the [`BatchedGpuBuiltInstanceBuffers`]. If we aren't using GPU mesh - // uniform building, bail out. - let Some(batched_instance_buffers) = batched_instance_buffers else { - return; - }; + // Grab the `BatchedInstanceBuffers`. 
let BatchedInstanceBuffers { data_buffer: ref data_buffer_vec, work_item_buffers: ref index_buffers, diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f3a9d6ecf2abe..732debb1441e2 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -156,13 +156,6 @@ impl Plugin for MeshRenderPlugin { .add_systems( Render, ( - gpu_preprocessing::delete_old_work_item_buffers:: - .in_set(RenderSet::ManageViews) - .after(prepare_view_targets), - no_gpu_preprocessing::write_batched_instance_buffer:: - .in_set(RenderSet::PrepareResourcesFlush), - gpu_preprocessing::write_batched_instance_buffers:: - .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), prepare_morphs.in_set(RenderSet::PrepareResources), prepare_mesh_bind_group.in_set(RenderSet::PrepareBindGroups), @@ -171,15 +164,32 @@ impl Plugin for MeshRenderPlugin { ); if self.use_gpu_instance_buffer_builder { - render_app.add_systems( - ExtractSchedule, - extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), - ); + render_app + .add_systems( + ExtractSchedule, + extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + ( + gpu_preprocessing::write_batched_instance_buffers:: + .in_set(RenderSet::PrepareResourcesFlush), + gpu_preprocessing::delete_old_work_item_buffers:: + .in_set(RenderSet::ManageViews) + .after(prepare_view_targets), + ), + ); } else { - render_app.add_systems( - ExtractSchedule, - extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), - ); + render_app + .add_systems( + ExtractSchedule, + extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + no_gpu_preprocessing::write_batched_instance_buffer:: + .in_set(RenderSet::PrepareResourcesFlush), + ); } } } @@ -281,7 +291,7 @@ pub struct MeshInputUniform { /// The index of this mesh's [`MeshInputUniform`] in the previous frame's /// buffer, if applicable. 
/// - /// This is used for TAA. If not present, this will be `!0`. + /// This is used for TAA. If not present, this will be `u32::MAX`. pub previous_input_index: u32, } @@ -736,8 +746,8 @@ fn collect_meshes_for_gpu_building( transform: builder.transform.to_transpose(), lightmap_uv_rect: builder.lightmap_uv_rect, flags: builder.mesh_flags.bits(), - previous_input_index: previous_input_index.unwrap_or(!0), - }); + previous_input_index: previous_input_index.unwrap_or(u32::MAX), + }) as u32; // Record the [`RenderMeshInstance`]. render_mesh_instances.insert( @@ -745,7 +755,7 @@ fn collect_meshes_for_gpu_building( RenderMeshInstanceGpu { translation: builder.transform.translation, shared: builder.shared, - current_uniform_index: current_uniform_index as u32, + current_uniform_index, }, ); } diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index b0c76e7086c15..c4adaa5105623 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -19,7 +19,7 @@ struct MeshInput { // Various flags. flags: u32, // The index of this mesh's `MeshInput` in the `previous_input` array, if - // applicable. If not present, this is `~0`. + // applicable. If not present, this is `u32::MAX`. previous_input_index: u32, } diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 2e94b46a0b0cb..82a68cdf9a46d 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -122,26 +122,24 @@ where /// [`ViewTarget`]s aren't created until after the extraction phase is /// completed. 
pub fn delete_old_work_item_buffers( - gpu_batched_instance_buffers: Option< - ResMut>, + mut gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, >, view_targets: Query>, ) where GFBD: GetFullBatchData, { - if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { - gpu_batched_instance_buffers - .work_item_buffers - .retain(|entity, _| view_targets.contains(*entity)); - } + gpu_batched_instance_buffers + .work_item_buffers + .retain(|entity, _| view_targets.contains(*entity)); } /// Batch the items in a sorted render phase, when GPU instance buffer building /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase( - gpu_batched_instance_buffers: Option< - ResMut>, + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, >, mut views: Query<(Entity, &mut SortedRenderPhase)>, param: StaticSystemParam, @@ -174,9 +172,6 @@ pub fn batch_and_prepare_sorted_render_phase( }; // We only process GPU-built batch data in this function. - let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, @@ -207,8 +202,8 @@ pub fn batch_and_prepare_sorted_render_phase( /// Creates batches for a render phase that uses bins. pub fn batch_and_prepare_binned_render_phase( - gpu_batched_instance_buffers: Option< - ResMut>, + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, >, mut views: Query<(Entity, &mut BinnedRenderPhase)>, param: StaticSystemParam, @@ -218,10 +213,6 @@ pub fn batch_and_prepare_binned_render_phase( { let system_param_item = param.into_inner(); - // We only process GPU-built batch data in this function. 
- let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, @@ -301,16 +292,12 @@ pub fn batch_and_prepare_binned_render_phase( pub fn write_batched_instance_buffers( render_device: Res, render_queue: Res, - gpu_batched_instance_buffers: Option< - ResMut>, + mut gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, >, ) where GFBD: GetFullBatchData, { - let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers else { - return; - }; - gpu_batched_instance_buffers .data_buffer .write_buffer(&render_device); diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs index 9b837757327ed..ca81eb6698b80 100644 --- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs @@ -47,7 +47,7 @@ where /// isn't in use. This means comparing metadata needed to draw each phase item /// and trying to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase( - cpu_batched_instance_buffer: Option>>, + batched_instance_buffer: ResMut>, mut views: Query<&mut SortedRenderPhase>, param: StaticSystemParam, ) where @@ -72,14 +72,11 @@ pub fn batch_and_prepare_sorted_render_phase( }; // We only process CPU-built batch data in this function. 
- let Some(cpu_batched_instance_buffers) = cpu_batched_instance_buffer else { - return; - }; - let cpu_batched_instance_buffers = cpu_batched_instance_buffers.into_inner(); + let batched_instance_buffer = batched_instance_buffer.into_inner(); for mut phase in &mut views { let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, cpu_batched_instance_buffers); + let batch_data = process_item(item, batched_instance_buffer); (item.batch_range_mut(), batch_data) }); items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { @@ -96,7 +93,7 @@ pub fn batch_and_prepare_sorted_render_phase( /// Creates batches for a render phase that uses bins, when GPU batch data /// building isn't in use. pub fn batch_and_prepare_binned_render_phase( - cpu_batched_instance_buffer: Option>>, + mut buffer: ResMut>, mut views: Query<&mut BinnedRenderPhase>, param: StaticSystemParam, ) where @@ -105,11 +102,6 @@ pub fn batch_and_prepare_binned_render_phase( { let system_param_item = param.into_inner(); - // We only process CPU-built batch data in this function. - let Some(mut buffer) = cpu_batched_instance_buffer else { - return; - }; - for mut phase in &mut views { let phase = &mut *phase; // Borrow checker. 
@@ -167,11 +159,9 @@ pub fn batch_and_prepare_binned_render_phase( pub fn write_batched_instance_buffer( render_device: Res, render_queue: Res, - cpu_batched_instance_buffer: Option>>, + mut cpu_batched_instance_buffer: ResMut>, ) where GBD: GetBatchData, { - if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { - cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue); - } + cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue); } diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index b584894ba7db2..494e59e9da828 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -38,7 +38,12 @@ use nonmax::NonMaxU32; pub use rangefinder::*; use crate::{ - batching::{self, gpu_preprocessing, no_gpu_preprocessing, GetFullBatchData}, + batching::{ + self, + gpu_preprocessing::{self, BatchedInstanceBuffers}, + no_gpu_preprocessing::{self, BatchedInstanceBuffer}, + GetFullBatchData, + }, render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache}, Render, RenderApp, RenderSet, }; @@ -331,8 +336,13 @@ where ( batching::sort_binned_render_phase::.in_set(RenderSet::PhaseSort), ( - no_gpu_preprocessing::batch_and_prepare_binned_render_phase::, - gpu_preprocessing::batch_and_prepare_binned_render_phase::, + no_gpu_preprocessing::batch_and_prepare_binned_render_phase:: + .run_if(resource_exists::>), + gpu_preprocessing::batch_and_prepare_binned_render_phase::.run_if( + resource_exists::< + BatchedInstanceBuffers, + >, + ), ) .in_set(RenderSet::PrepareResources), ), @@ -373,8 +383,13 @@ where render_app.add_systems( Render, ( - no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::, - gpu_preprocessing::batch_and_prepare_sorted_render_phase::, + no_gpu_preprocessing::batch_and_prepare_sorted_render_phase:: + .run_if(resource_exists::>), + gpu_preprocessing::batch_and_prepare_sorted_render_phase::.run_if( 
+ resource_exists::< + BatchedInstanceBuffers, + >, + ), ) .in_set(RenderSet::PrepareResources), ); From 2f4fabaee20b5ce38f49be927139aec6199c40bb Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 15:09:43 -0700 Subject: [PATCH 24/39] Doc check police --- crates/bevy_pbr/src/render/mesh.rs | 9 ++-- .../src/batching/gpu_preprocessing.rs | 4 +- crates/bevy_render/src/batching/mod.rs | 41 ++++++++++--------- crates/bevy_render/src/render_phase/mod.rs | 17 ++++---- .../src/render_resource/buffer_vec.rs | 4 +- 5 files changed, 37 insertions(+), 38 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index a235f48dc1a7e..33b15a4ea05c6 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -406,9 +406,10 @@ pub struct RenderMeshInstanceShared { /// Information that is gathered during the parallel portion of mesh extraction /// when GPU mesh uniform building is enabled. /// -/// From this, the [`MeshInputUniform`] and [`RenderMeshInstance`] are prepared. +/// From this, the [`MeshInputUniform`] and [`RenderMeshInstanceGpu`] are +/// prepared. pub struct RenderMeshInstanceGpuBuilder { - /// Data that will be placed on the [`RenderMeshInstance`]. + /// Data that will be placed on the [`RenderMeshInstanceGpu`]. pub shared: RenderMeshInstanceShared, /// The current transform. pub transform: Affine3, @@ -695,8 +696,8 @@ pub fn extract_meshes_for_gpu_building( ); } -/// Creates the [`RenderMeshInstance`]s and [`MeshInputUniform`]s when GPU mesh -/// uniforms are built. +/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU +/// mesh uniforms are built. 
fn collect_meshes_for_gpu_building( render_mesh_instances: &mut RenderMeshInstances, batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers< diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 82a68cdf9a46d..94f9905da28dd 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -118,8 +118,8 @@ where /// A system that removes GPU preprocessing work item buffers that correspond to /// deleted [`ViewTarget`]s. /// -/// This is a separate system from [`clear_batched_instance_buffers`] because -/// [`ViewTarget`]s aren't created until after the extraction phase is +/// This is a separate system from [`super::clear_batched_instance_buffers`] +/// because [`ViewTarget`]s aren't created until after the extraction phase is /// completed. pub fn delete_old_work_item_buffers( mut gpu_batched_instance_buffers: ResMut< diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index a06410ed7e9b9..7691520ce35d7 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -71,17 +71,18 @@ pub trait GetBatchData { /// function id, per-instance data buffer dynamic offset and this data /// matches, the draws can be batched. type CompareData: PartialEq; - /// The per-instance data to be inserted into the [`GpuArrayBuffer`] - /// containing these data for all instances. + /// The per-instance data to be inserted into the + /// [`crate::render_resource::GpuArrayBuffer`] containing these data for all + /// instances. type BufferData: GpuArrayBufferable + Sync + Send + 'static; - /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. - /// If the instance can be batched, also return the data used for - /// comparison when deciding whether draws can be batched, else return None - /// for the `CompareData`. 
+ /// Get the per-instance data to be inserted into the + /// [`crate::render_resource::GpuArrayBuffer`]. If the instance can be + /// batched, also return the data used for comparison when deciding whether + /// draws can be batched, else return None for the `CompareData`. /// /// This is only called when building instance data on CPU. In the GPU - /// uniform building path, we use - /// [`GetBatchData::get_batch_preprocess_work_item`] instead. + /// instance data building path, we use + /// [`GetFullBatchData::get_batch_input_index`] instead. fn get_batch_data( param: &SystemParamItem, query_item: Entity, @@ -93,25 +94,25 @@ pub trait GetBatchData { /// /// This version allows for binning and GPU preprocessing. pub trait GetFullBatchData: GetBatchData { - /// The per-instance data that was inserted into the [`BufferVec`] during - /// extraction. + /// The per-instance data that was inserted into the + /// [`crate::render_resource::BufferVec`] during extraction. type BufferInputData: Pod + Sync + Send; - /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`]. + /// Get the per-instance data to be inserted into the + /// [`crate::render_resource::GpuArrayBuffer`]. /// /// This is only called when building uniforms on CPU. In the GPU instance - /// buffer building path, we use - /// [`GetBinnedBatchData::get_batch_input_index`] + /// buffer building path, we use [`GetFullBatchData::get_batch_input_index`] /// instead. fn get_binned_batch_data( param: &SystemParamItem, query_item: Entity, ) -> Option; - /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU - /// preprocessing phase will use. + /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the + /// GPU preprocessing phase will use. 
/// - /// We already inserted the [`GetBatchData::BufferInputData`] during the + /// We already inserted the [`GetFullBatchData::BufferInputData`] during the /// extraction phase before we got here, so this function shouldn't need to /// look up any render data. If CPU instance buffer building is in use, this /// function will never be called. @@ -120,10 +121,10 @@ pub trait GetFullBatchData: GetBatchData { query_item: Entity, ) -> Option<(u32, Option)>; - /// Returns the index of the [`GetBatchData::BufferInputData`] that the GPU - /// preprocessing phase will use, for the binning path. + /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the + /// GPU preprocessing phase will use, for the binning path. /// - /// We already inserted the [`GetBatchData::BufferInputData`] during the + /// We already inserted the [`GetFullBatchData::BufferInputData`] during the /// extraction phase before we got here, so this function shouldn't need to /// look up any render data. If CPU instance buffer building is in use, this /// function will never be called. @@ -134,7 +135,7 @@ pub trait GetFullBatchData: GetBatchData { } /// A system that runs early in extraction and clears out all the -/// [`BatchedInstanceBuffers`] for the frame. +/// [`gpu_preprocessing::BatchedInstanceBuffers`] for the frame. /// /// We have to run this during extraction because, if GPU preprocessing is in /// use, the extraction phase will write to the mesh input uniform buffers diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs index 494e59e9da828..58e1fa550f4e2 100644 --- a/crates/bevy_render/src/render_phase/mod.rs +++ b/crates/bevy_render/src/render_phase/mod.rs @@ -10,11 +10,10 @@ //! //! To draw an entity, a corresponding [`PhaseItem`] has to be added to one or multiple of these //! render phases for each view that it is visible in. -//! This must be done in the [`RenderSet::Queue`](crate::RenderSet::Queue). -//! 
After that the render phase sorts them in the -//! [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort). -//! Finally the items are rendered using a single [`TrackedRenderPass`], during the -//! [`RenderSet::Render`](crate::RenderSet::Render). +//! This must be done in the [`RenderSet::Queue`]. +//! After that the render phase sorts them in the [`RenderSet::PhaseSort`]. +//! Finally the items are rendered using a single [`TrackedRenderPass`], during +//! the [`RenderSet::Render`]. //! //! Therefore each phase item is assigned a [`Draw`] function. //! These set up the state of the [`TrackedRenderPass`] (i.e. select the @@ -568,12 +567,10 @@ where /// /// The data required for rendering an entity is extracted from the main world in the /// [`ExtractSchedule`](crate::ExtractSchedule). -/// Then it has to be queued up for rendering during the -/// [`RenderSet::Queue`](crate::RenderSet::Queue), by adding a corresponding phase item to -/// a render phase. +/// Then it has to be queued up for rendering during the [`RenderSet::Queue`], +/// by adding a corresponding phase item to a render phase. /// Afterwards it will be possibly sorted and rendered automatically in the -/// [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort) and -/// [`RenderSet::Render`](crate::RenderSet::Render), respectively. +/// [`RenderSet::PhaseSort`] and [`RenderSet::Render`], respectively. /// /// `PhaseItem`s come in two flavors: [`BinnedPhaseItem`]s and /// [`SortedPhaseItem`]s. diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 6969b052e346b..8a0f77daafb3d 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -171,8 +171,8 @@ impl Extend for BufferVec { /// This type is useful when you're accumulating "output slots" for a GPU /// compute shader to write into. 
/// -/// The type `T` need not be [`Pod`], unlike [`BufferVec`]; it only has to be -/// [`GpuArrayBufferable`]. +/// The type `T` need not be [`NoUninit`], unlike [`BufferVec`]; it only has to +/// be [`GpuArrayBufferable`]. pub struct UninitBufferVec where T: GpuArrayBufferable, From d904138ffd7e6e5d1990414873c32a424a877b4f Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Fri, 5 Apr 2024 20:10:34 -0700 Subject: [PATCH 25/39] Fix deferred by moving the preprocess pipeline back --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 7661997f76b92..e2c6c4c78020c 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -10,7 +10,7 @@ use std::num::NonZeroU64; use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; -use bevy_core_pipeline::core_3d::graph::Core3d; +use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; use bevy_ecs::{ component::Component, entity::Entity, @@ -105,7 +105,7 @@ impl Plugin for GpuMeshPreprocessPlugin { // Stitch the node in. 
render_app .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) - .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) .init_resource::() .init_resource::>(); } From 88cda714ebd2739094884a387aeec125bd75acf7 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sat, 6 Apr 2024 20:10:09 -0700 Subject: [PATCH 26/39] Address a few review comments --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 11 +++-------- crates/bevy_pbr/src/render/mesh.rs | 8 +++----- crates/bevy_render/src/batching/gpu_preprocessing.rs | 10 +++++----- crates/bevy_render/src/maths.wgsl | 5 ++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index e2c6c4c78020c..2b17fbdef0aee 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -138,7 +138,7 @@ impl Node for GpuPreprocessNode { }) = world.get_resource::>() else { error!( - "Attempted to preprocess meshes on GPU, but `GpuBuilt` batched instance buffers \ + "Attempted to preprocess meshes on GPU, but GPU-built batched instance buffers \ weren't available" ); return Ok(()); @@ -148,7 +148,7 @@ impl Node for GpuPreprocessNode { let preprocess_pipeline = world.resource::(); let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else { - warn!("The build mesh uniforms pipeline wasn't uploaded"); + warn!("The build mesh uniforms pipeline wasn't created"); return Ok(()); }; @@ -176,7 +176,7 @@ impl Node for GpuPreprocessNode { }; compute_pass.set_bind_group(0, &bind_group.0, &[]); - let workgroup_count = div_round_up(index_buffer.len(), WORKGROUP_SIZE); + let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE); compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); } @@ -298,8 +298,3 @@ pub fn prepare_preprocess_bind_groups( ))); } } - -/// Returns `a / 
b`, rounded toward positive infinity. -fn div_round_up(a: usize, b: usize) -> usize { - (a + b - 1) / b -} diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 33b15a4ea05c6..257b4259f66fa 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -926,8 +926,7 @@ impl GetFullBatchData for MeshPipeline { // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( - "`get_batch_index` should never be called in CPU mesh uniform \ - building mode" + "`get_batch_input_index` should never be called in CPU mesh uniform building mode" ); return None; }; @@ -951,8 +950,7 @@ impl GetFullBatchData for MeshPipeline { ) -> Option { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { error!( - "`get_batch_data` should never be called in GPU mesh uniform \ - building mode" + "`get_binned_batch_data` should never be called in GPU mesh uniform building mode" ); return None; }; @@ -972,7 +970,7 @@ impl GetFullBatchData for MeshPipeline { // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( - "`get_batch_index` should never be called in CPU mesh uniform \ + "`get_binned_batch_input_index` should never be called in CPU mesh uniform \ building mode" ); return None; diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 94f9905da28dd..2f9b52cbf2252 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -32,9 +32,9 @@ use super::{BatchMeta, GetFullBatchData}; /// /// We have a separate *buffer data input* type (`BDI`) here, which a compute /// shader is expected to expand to the full buffer data (`BD`) type. 
GPU -/// uniform building is generally faster and uses less GPU bus bandwidth, but -/// only implemented for some pipelines (for example, not in the 2D pipeline at -/// present) and only when compute shader is available. +/// uniform building is generally faster and uses less system RAM to VRAM bus +/// bandwidth, but only implemented for some pipelines (for example, not in the +/// 2D pipeline at present) and only when compute shader is available. #[derive(Resource)] pub struct BatchedInstanceBuffers where @@ -135,8 +135,8 @@ pub fn delete_old_work_item_buffers( } /// Batch the items in a sorted render phase, when GPU instance buffer building -/// isn't in use. This means comparing metadata needed to draw each phase item -/// and trying to combine the draws into a batch. +/// is in use. This means comparing metadata needed to draw each phase item and +/// trying to combine the draws into a batch. pub fn batch_and_prepare_sorted_render_phase( gpu_batched_instance_buffers: ResMut< BatchedInstanceBuffers, diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl index 04b2f3a504c5f..53c254f3746f8 100644 --- a/crates/bevy_render/src/maths.wgsl +++ b/crates/bevy_render/src/maths.wgsl @@ -45,10 +45,9 @@ fn inverse_mat3x3(matrix: mat3x3) -> mat3x3 { // Returns the inverse of an affine matrix. // -// Recall that an affine matrix is just a 4x4 matrix with the last column of [0, -// 0, 0, 1]; thus the inverse is well-defined. 
+// https://en.wikipedia.org/wiki/Affine_transformation#Groups fn inverse_affine3(affine: mat4x3) -> mat4x3 { let matrix3 = affine3_to_mat3x3(affine); let inv_matrix3 = inverse_mat3x3(matrix3); - return mat4x3(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(matrix3 * affine[3])); + return mat4x3(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3])); } From 5ee2b970cb34c148f5fb9f33b52c7a891c994cd4 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 7 Apr 2024 15:06:18 -0700 Subject: [PATCH 27/39] Address review comment --- crates/bevy_render/src/batching/gpu_preprocessing.rs | 10 ++++++++++ crates/bevy_render/src/batching/mod.rs | 8 +------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 2f9b52cbf2252..e3614eec0bc4f 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -103,6 +103,16 @@ where .buffer() .map(|buffer| buffer.as_entire_binding()) } + + /// Clears out the buffers in preparation for a new frame. 
+ pub fn clear(&mut self) { + self.data_buffer.clear(); + self.current_input_buffer.clear(); + self.previous_input_buffer.clear(); + for work_item_buffer in self.work_item_buffers.values_mut() { + work_item_buffer.clear(); + } + } } impl Default for BatchedInstanceBuffers diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 7691520ce35d7..b494faf049aa8 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -153,14 +153,8 @@ pub fn clear_batched_instance_buffers( if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer { cpu_batched_instance_buffer.clear(); } - if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers { - gpu_batched_instance_buffers.data_buffer.clear(); - gpu_batched_instance_buffers.current_input_buffer.clear(); - gpu_batched_instance_buffers.previous_input_buffer.clear(); - for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() { - work_item_buffer.clear(); - } + gpu_batched_instance_buffers.clear(); } } From 78a8199cba09e44cd23e24801a866f5ade0b1f36 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 7 Apr 2024 15:26:09 -0700 Subject: [PATCH 28/39] Address some review comments --- crates/bevy_pbr/src/render/gpu_preprocess.rs | 24 ++++----------- crates/bevy_pbr/src/render/mesh.rs | 32 +++++++++++++++++--- crates/bevy_sprite/src/mesh2d/mesh.rs | 1 - 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 2b17fbdef0aee..c331957366922 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -32,7 +32,7 @@ use bevy_render::{ renderer::{RenderContext, RenderDevice}, Render, RenderApp, RenderSet, }; -use bevy_utils::tracing::{error, warn}; +use bevy_utils::tracing::warn; use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; @@ 
-130,19 +130,11 @@ impl Node for GpuPreprocessNode { render_context: &mut RenderContext<'w>, world: &'w World, ) -> Result<(), NodeRunError> { - // Grab the [`BatchedInstanceBuffers`]. If we aren't using GPU mesh - // uniform building, bail out. - let Some(BatchedInstanceBuffers { + // Grab the [`BatchedInstanceBuffers`]. + let BatchedInstanceBuffers { work_item_buffers: ref index_buffers, .. - }) = world.get_resource::>() - else { - error!( - "Attempted to preprocess meshes on GPU, but GPU-built batched instance buffers \ - weren't available" - ); - return Ok(()); - }; + } = world.resource::>(); let pipeline_cache = world.resource::(); let preprocess_pipeline = world.resource::(); @@ -168,13 +160,9 @@ impl Node for GpuPreprocessNode { compute_pass.set_pipeline(preprocess_pipeline); + // Run the compute passes. for (view, bind_group) in self.view_query.iter_manual(world) { - // Grab the index buffer for this view. - let Some(index_buffer) = index_buffers.get(&view) else { - warn!("The preprocessing index buffer wasn't present"); - return Ok(()); - }; - + let index_buffer = &index_buffers[&view]; compute_pass.set_bind_group(0, &bind_group.0, &[]); let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE); compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 257b4259f66fa..a5cdee96a859a 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -1681,17 +1681,41 @@ impl RenderCommand

for SetMeshBindGroup { pub struct DrawMesh; impl RenderCommand

for DrawMesh { - type Param = (SRes>, SRes); - type ViewQuery = (); + type Param = ( + SRes>, + SRes, + SRes, + SRes, + ); + type ViewQuery = Has; type ItemQuery = (); #[inline] fn render<'w>( item: &P, - _view: (), + has_preprocess_bind_group: ROQueryItem, _item_query: Option<()>, - (meshes, mesh_instances): SystemParamItem<'w, '_, Self::Param>, + (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem< + 'w, + '_, + Self::Param, + >, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { + // If we're using GPU preprocessing, then we're dependent on that + // compute shader having been run, which of course can only happen if + // it's compiled. Otherwise, our mesh instance data won't be present. + if !has_preprocess_bind_group + || !preprocess_pipeline + .pipeline_id + .is_some_and(|preprocess_pipeline_id| { + pipeline_cache + .get_compute_pipeline(preprocess_pipeline_id) + .is_some() + }) + { + return RenderCommandResult::Failure; + } + let meshes = meshes.into_inner(); let mesh_instances = mesh_instances.into_inner(); diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index cd7765cee16b5..417af54dde2e9 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -153,7 +153,6 @@ pub struct Mesh2dTransforms { } #[derive(ShaderType, Clone)] -#[repr(C)] pub struct Mesh2dUniform { // Affine 4x3 matrix transposed to 3x4 pub transform: [Vec4; 3], From 87e3889206182a62c2133078a9e2e706bdc43442 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 7 Apr 2024 16:47:15 -0700 Subject: [PATCH 29/39] Address more review comments --- crates/bevy_pbr/src/render/mesh.rs | 87 ++++++++++++++----- .../src/batching/gpu_preprocessing.rs | 51 ++++------- crates/bevy_render/src/batching/mod.rs | 34 ++++++++ .../src/batching/no_gpu_preprocessing.rs | 38 +++----- 4 files changed, 123 insertions(+), 87 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs 
b/crates/bevy_pbr/src/render/mesh.rs index a5cdee96a859a..21f9b39fc0efc 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -5,7 +5,7 @@ use bevy_core_pipeline::{ core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT}, deferred::{AlphaMask3dDeferred, Opaque3dDeferred}, }; -use bevy_derive::Deref; +use bevy_derive::{Deref, DerefMut}; use bevy_ecs::entity::EntityHashMap; use bevy_ecs::{ prelude::*, @@ -471,50 +471,89 @@ impl RenderMeshInstanceShared { /// [`MeshUniform`] building is in use. #[derive(Resource)] pub enum RenderMeshInstances { - /// Information needed when using CPU mesh uniform building. - CpuBuilding(EntityHashMap), - /// Information needed when using GPU mesh uniform building. - GpuBuilding(EntityHashMap), + /// Information needed when using CPU mesh instance data building. + CpuBuilding(RenderMeshInstancesCpu), + /// Information needed when using GPU mesh instance data building. + GpuBuilding(RenderMeshInstancesGpu), } +/// Information that the render world keeps about each entity that contains a +/// mesh, when using CPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +struct RenderMeshInstancesCpu(EntityHashMap); + +/// Information that the render world keeps about each entity that contains a +/// mesh, when using GPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +struct RenderMeshInstancesGpu(EntityHashMap); + impl RenderMeshInstances { + /// Creates a new [`RenderMeshInstances`] instance. fn new(use_gpu_instance_buffer_builder: bool) -> RenderMeshInstances { if use_gpu_instance_buffer_builder { - RenderMeshInstances::GpuBuilding(EntityHashMap::default()) + RenderMeshInstances::GpuBuilding(RenderMeshInstancesGpu::default()) } else { - RenderMeshInstances::CpuBuilding(EntityHashMap::default()) + RenderMeshInstances::CpuBuilding(RenderMeshInstancesCpu::default()) } } /// Returns the ID of the mesh asset attached to the given entity, if any. 
pub(crate) fn mesh_asset_id(&self, entity: Entity) -> Option> { match *self { - RenderMeshInstances::CpuBuilding(ref instances) => instances - .get(&entity) - .map(|instance| instance.mesh_asset_id), - RenderMeshInstances::GpuBuilding(ref instances) => instances - .get(&entity) - .map(|instance| instance.mesh_asset_id), + RenderMeshInstances::CpuBuilding(ref instances) => instances.mesh_asset_id(entity), + RenderMeshInstances::GpuBuilding(ref instances) => instances.mesh_asset_id(entity), } } /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a /// mesh attached. - pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { + /*pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { match *self { RenderMeshInstances::CpuBuilding(ref instances) => { - instances.get(&entity).map(|instance| RenderMeshQueueData { - shared: &instance.shared, - translation: instance.transforms.transform.translation, - }) + instances.render_mesh_queue_data(entity) } RenderMeshInstances::GpuBuilding(ref instances) => { - instances.get(&entity).map(|instance| RenderMeshQueueData { - shared: &instance.shared, - translation: instance.translation, - }) + instances.render_mesh_queue_data(entity) } } + }*/ +} + +pub(crate) trait RenderMeshInstancesTable { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option>; + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. 
+ fn render_mesh_queue_data(&self, entity: Entity) -> Option; +} + +impl RenderMeshInstancesTable for RenderMeshInstancesCpu { + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.transforms.transform.translation, + }) + } +} + +impl RenderMeshInstancesTable for RenderMeshInstancesGpu { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.translation, + }) } } @@ -628,7 +667,7 @@ pub fn extract_meshes_for_gpu_building( gpu_preprocessing::BatchedInstanceBuffers, >, mut render_mesh_instance_queues: Local>>, - mut prev_render_mesh_instances: Local>, + mut prev_render_mesh_instances: Local, meshes_query: Extract< Query<( Entity, @@ -705,7 +744,7 @@ fn collect_meshes_for_gpu_building( MeshInputUniform, >, render_mesh_instance_queues: &mut Parallel>, - prev_render_mesh_instances: &mut EntityHashMap, + prev_render_mesh_instances: &mut RenderMeshInstancesGpu, ) { // Collect render mesh instances. Build up the uniform buffer. 
let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index e3614eec0bc4f..3234a1ce6979e 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -23,7 +23,7 @@ use crate::{ view::ViewTarget, }; -use super::{BatchMeta, GetFullBatchData}; +use super::GetFullBatchData; /// The GPU buffers holding the data needed to render batches. /// @@ -159,28 +159,6 @@ pub fn batch_and_prepare_sorted_render_phase( { let system_param_item = param.into_inner(); - let process_item = - |item: &mut I, - data_buffer: &mut UninitBufferVec, - work_item_buffer: &mut BufferVec| { - let (input_index, compare_data) = - GFBD::get_batch_input_index(&system_param_item, item.entity())?; - let output_index = data_buffer.add() as u32; - - work_item_buffer.push(PreprocessWorkItem { - input_index, - output_index, - }); - - *item.batch_range_mut() = output_index..output_index + 1; - - if I::AUTOMATIC_BATCHING { - compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) - } else { - None - } - }; - // We only process GPU-built batch data in this function. let BatchedInstanceBuffers { ref mut data_buffer, @@ -189,23 +167,24 @@ pub fn batch_and_prepare_sorted_render_phase( } = gpu_batched_instance_buffers.into_inner(); for (view, mut phase) in &mut views { - // Create the work item buffer if necessary; otherwise, just mark it as - // used this frame. + // Create the work item buffer if necessary. 
let work_item_buffer = work_item_buffers .entry(view) .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); - let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, data_buffer, work_item_buffer); - (item.batch_range_mut(), batch_data) - }); - items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { - if batch_meta.is_some() && prev_batch_meta == batch_meta { - start_range.end = range.end; - (start_range, prev_batch_meta) - } else { - (range, batch_meta) - } + super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { + let (input_index, compare_data) = + GFBD::get_batch_input_index(&system_param_item, item.entity())?; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index, + output_index, + }); + + *item.batch_range_mut() = output_index..output_index + 1; + + compare_data }); } } diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index b494faf049aa8..29dbb33f202f1 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -9,6 +9,7 @@ use nonmax::NonMaxU32; use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem, DrawFunctionId, + SortedPhaseItem, SortedRenderPhase, }, render_resource::{CachedRenderPipelineId, GpuArrayBufferable}, }; @@ -168,3 +169,36 @@ where phase.unbatchable_keys.sort_unstable(); } } + +/// Batches the items in a sorted render phase. +/// +/// This means comparing metadata needed to draw each phase item and trying to +/// combine the draws into a batch. +/// +/// This is common code factored out from +/// [`gpu_preprocessing::batch_and_prepare_sorted_render_phase`] and +/// [`no_gpu_preprocessing::batch_and_prepare_sorted_render_phase`]. 
+fn batch_and_prepare_sorted_render_phase( + phase: &mut SortedRenderPhase, + mut process_item: impl FnMut(&mut I) -> Option, +) where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + GBD: GetBatchData, +{ + let items = phase.items.iter_mut().map(|item| { + let batch_data = match process_item(item) { + Some(compare_data) if I::AUTOMATIC_BATCHING => Some(BatchMeta::new(item, compare_data)), + _ => None, + }; + (item.batch_range_mut(), batch_data) + }); + + items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { + if batch_meta.is_some() && prev_batch_meta == batch_meta { + start_range.end = range.end; + (start_range, prev_batch_meta) + } else { + (range, batch_meta) + } + }); +} diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs index ca81eb6698b80..429fe5bb4542e 100644 --- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs @@ -14,7 +14,7 @@ use crate::{ renderer::{RenderDevice, RenderQueue}, }; -use super::{BatchMeta, GetBatchData, GetFullBatchData}; +use super::{GetBatchData, GetFullBatchData}; /// The GPU buffers holding the data needed to render batches. /// @@ -56,36 +56,20 @@ pub fn batch_and_prepare_sorted_render_phase( { let system_param_item = param.into_inner(); - let process_item = |item: &mut I, buffer: &mut GpuArrayBuffer| { - let (buffer_data, compare_data) = GBD::get_batch_data(&system_param_item, item.entity())?; - let buffer_index = buffer.push(buffer_data); - - let index = buffer_index.index; - *item.batch_range_mut() = index..index + 1; - *item.dynamic_offset_mut() = buffer_index.dynamic_offset; - - if I::AUTOMATIC_BATCHING { - compare_data.map(|compare_data| BatchMeta::new(item, compare_data)) - } else { - None - } - }; - // We only process CPU-built batch data in this function. 
let batched_instance_buffer = batched_instance_buffer.into_inner(); for mut phase in &mut views { - let items = phase.items.iter_mut().map(|item| { - let batch_data = process_item(item, batched_instance_buffer); - (item.batch_range_mut(), batch_data) - }); - items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| { - if batch_meta.is_some() && prev_batch_meta == batch_meta { - start_range.end = range.end; - (start_range, prev_batch_meta) - } else { - (range, batch_meta) - } + super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { + let (buffer_data, compare_data) = + GBD::get_batch_data(&system_param_item, item.entity())?; + let buffer_index = batched_instance_buffer.push(buffer_data); + + let index = buffer_index.index; + *item.batch_range_mut() = index..index + 1; + *item.dynamic_offset_mut() = buffer_index.dynamic_offset; + + compare_data }); } } From cb087c12ae6179e9faace9d89fd7771ae121c3fc Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 7 Apr 2024 17:05:53 -0700 Subject: [PATCH 30/39] Factor out the enum switches. --- crates/bevy_pbr/src/material.rs | 97 ++++++++++++++++++++++++++++- crates/bevy_pbr/src/prepass/mod.rs | 82 +++++++++++++++++++++++- crates/bevy_pbr/src/render/light.rs | 73 +++++++++++++++++++++- crates/bevy_pbr/src/render/mesh.rs | 8 +-- 4 files changed, 252 insertions(+), 8 deletions(-) diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index 698a6ae10d870..bff8587101ad5 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -512,14 +512,16 @@ pub const fn screen_space_specular_transmission_pipeline_key( } } +/// For each view, iterates over all the meshes visible from that view and adds +/// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. 
#[allow(clippy::too_many_arguments)] -pub fn queue_material_meshes( +pub fn queue_material_meshes( opaque_draw_functions: Res>, alpha_mask_draw_functions: Res>, transmissive_draw_functions: Res>, transparent_draw_functions: Res>, material_pipeline: Res>, - mut pipelines: ResMut>>, + pipelines: ResMut>>, pipeline_cache: Res, msaa: Res, render_meshes: Res>, @@ -527,6 +529,95 @@ pub fn queue_material_meshes( render_mesh_instances: Res, render_material_instances: Res>, render_lightmaps: Res, + views: Query<( + &ExtractedView, + &VisibleEntities, + Option<&Tonemapping>, + Option<&DebandDither>, + Option<&ShadowFilteringMethod>, + Has, + ( + Has, + Has, + Has, + Has, + ), + Option<&Camera3d>, + Has, + Option<&Projection>, + &mut BinnedRenderPhase, + &mut BinnedRenderPhase, + &mut SortedRenderPhase, + &mut SortedRenderPhase, + ( + Has>, + Has>, + ), + )>, +) where + M: Material, + M::Data: PartialEq + Eq + Hash + Clone, +{ + match *render_mesh_instances { + RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { + queue_material_meshes_with_render_mesh_instances( + opaque_draw_functions, + alpha_mask_draw_functions, + transmissive_draw_functions, + transparent_draw_functions, + material_pipeline, + pipelines, + pipeline_cache, + msaa, + render_meshes, + render_materials, + render_mesh_instances, + render_material_instances, + render_lightmaps, + views, + ); + } + RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { + queue_material_meshes_with_render_mesh_instances( + opaque_draw_functions, + alpha_mask_draw_functions, + transmissive_draw_functions, + transparent_draw_functions, + material_pipeline, + pipelines, + pipeline_cache, + msaa, + render_meshes, + render_materials, + render_mesh_instances, + render_material_instances, + render_lightmaps, + views, + ); + } + } +} + +/// For each view, iterates over all the meshes visible from that view and adds +/// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. 
+/// +/// This is a helper function. We dispatch to it in order to avoid branching on +/// the variant of [`RenderMeshInstances`] in a hot loop. +#[allow(clippy::too_many_arguments)] +fn queue_material_meshes_with_render_mesh_instances( + opaque_draw_functions: Res>, + alpha_mask_draw_functions: Res>, + transmissive_draw_functions: Res>, + transparent_draw_functions: Res>, + material_pipeline: Res>, + mut pipelines: ResMut>>, + pipeline_cache: Res, + msaa: Res, + render_meshes: Res>, + render_materials: Res>, + render_mesh_instances: &RMIT, + render_material_instances: Res>, + render_lightmaps: Res, mut views: Query<( &ExtractedView, &VisibleEntities, @@ -553,7 +644,9 @@ pub fn queue_material_meshes( ), )>, ) where + M: Material, M::Data: PartialEq + Eq + Hash + Clone, + RMIT: RenderMeshInstancesTable, { for ( view, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 30f2b78cb66cc..a8dfe0c428202 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -706,7 +706,7 @@ pub fn queue_prepass_material_meshes( opaque_deferred_draw_functions: Res>, alpha_mask_deferred_draw_functions: Res>, prepass_pipeline: Res>, - mut pipelines: ResMut>>, + pipelines: ResMut>>, pipeline_cache: Res, msaa: Res, render_meshes: Res>, @@ -714,6 +714,84 @@ pub fn queue_prepass_material_meshes( render_materials: Res>, render_material_instances: Res>, render_lightmaps: Res, + views: Query< + ( + &ExtractedView, + &VisibleEntities, + Option<&mut BinnedRenderPhase>, + Option<&mut BinnedRenderPhase>, + Option<&mut BinnedRenderPhase>, + Option<&mut BinnedRenderPhase>, + Option<&DepthPrepass>, + Option<&NormalPrepass>, + Option<&MotionVectorPrepass>, + Option<&DeferredPrepass>, + ), + Or<( + With>, + With>, + With>, + With>, + )>, + >, +) where + M::Data: PartialEq + Eq + Hash + Clone, +{ + match *render_mesh_instances { + RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { + 
queue_prepass_material_meshes_with_render_mesh_instances( + opaque_draw_functions, + alpha_mask_draw_functions, + opaque_deferred_draw_functions, + alpha_mask_deferred_draw_functions, + prepass_pipeline, + pipelines, + pipeline_cache, + msaa, + render_meshes, + render_mesh_instances, + render_materials, + render_material_instances, + render_lightmaps, + views, + ); + } + RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { + queue_prepass_material_meshes_with_render_mesh_instances( + opaque_draw_functions, + alpha_mask_draw_functions, + opaque_deferred_draw_functions, + alpha_mask_deferred_draw_functions, + prepass_pipeline, + pipelines, + pipeline_cache, + msaa, + render_meshes, + render_mesh_instances, + render_materials, + render_material_instances, + render_lightmaps, + views, + ); + } + } +} + +#[allow(clippy::too_many_arguments)] +fn queue_prepass_material_meshes_with_render_mesh_instances( + opaque_draw_functions: Res>, + alpha_mask_draw_functions: Res>, + opaque_deferred_draw_functions: Res>, + alpha_mask_deferred_draw_functions: Res>, + prepass_pipeline: Res>, + mut pipelines: ResMut>>, + pipeline_cache: Res, + msaa: Res, + render_meshes: Res>, + render_mesh_instances: &RMIT, + render_materials: Res>, + render_material_instances: Res>, + render_lightmaps: Res, mut views: Query< ( &ExtractedView, @@ -735,7 +813,9 @@ pub fn queue_prepass_material_meshes( )>, >, ) where + M: Material, M::Data: PartialEq + Eq + Hash + Clone, + RMIT: RenderMeshInstancesTable, { let opaque_draw_prepass = opaque_draw_functions .read() diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 4697af948bfcc..94341ced0a56b 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -1594,14 +1594,83 @@ pub fn prepare_clusters( } } +/// For each shadow cascade, iterates over all the meshes "visible" from it and +/// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as +/// appropriate. 
#[allow(clippy::too_many_arguments)] -pub fn queue_shadows( +pub fn queue_shadows( shadow_draw_functions: Res>, prepass_pipeline: Res>, render_meshes: Res>, render_mesh_instances: Res, render_materials: Res>, render_material_instances: Res>, + pipelines: ResMut>>, + pipeline_cache: Res, + render_lightmaps: Res, + view_lights: Query<(Entity, &ViewLightEntities)>, + view_light_shadow_phases: Query<(&LightEntity, &mut BinnedRenderPhase)>, + point_light_entities: Query<&CubemapVisibleEntities, With>, + directional_light_entities: Query<&CascadesVisibleEntities, With>, + spot_light_entities: Query<&VisibleEntities, With>, +) where + M: Material, + M::Data: PartialEq + Eq + Hash + Clone, +{ + match *render_mesh_instances { + RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { + queue_shadows_with_render_mesh_instances( + shadow_draw_functions, + prepass_pipeline, + render_meshes, + render_mesh_instances, + render_materials, + render_material_instances, + pipelines, + pipeline_cache, + render_lightmaps, + view_lights, + view_light_shadow_phases, + point_light_entities, + directional_light_entities, + spot_light_entities, + ); + } + RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { + queue_shadows_with_render_mesh_instances( + shadow_draw_functions, + prepass_pipeline, + render_meshes, + render_mesh_instances, + render_materials, + render_material_instances, + pipelines, + pipeline_cache, + render_lightmaps, + view_lights, + view_light_shadow_phases, + point_light_entities, + directional_light_entities, + spot_light_entities, + ); + } + } +} + +/// For each shadow cascade, iterates over all the meshes "visible" from it and +/// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as +/// appropriate. +/// +/// This is a helper function. We dispatch to it in order to avoid branching on +/// the variant of [`RenderMeshInstances`] in a hot loop. 
+#[allow(clippy::too_many_arguments)] +fn queue_shadows_with_render_mesh_instances( + shadow_draw_functions: Res>, + prepass_pipeline: Res>, + render_meshes: Res>, + render_mesh_instances: &RMIT, + render_materials: Res>, + render_material_instances: Res>, mut pipelines: ResMut>>, pipeline_cache: Res, render_lightmaps: Res, @@ -1611,7 +1680,9 @@ pub fn queue_shadows( directional_light_entities: Query<&CascadesVisibleEntities, With>, spot_light_entities: Query<&VisibleEntities, With>, ) where + M: Material, M::Data: PartialEq + Eq + Hash + Clone, + RMIT: RenderMeshInstancesTable, { for (entity, view_lights) in &view_lights { let draw_shadow_mesh = shadow_draw_functions.read().id::>(); diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 21f9b39fc0efc..af395a274618d 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -480,12 +480,12 @@ pub enum RenderMeshInstances { /// Information that the render world keeps about each entity that contains a /// mesh, when using CPU mesh instance data building. #[derive(Default, Deref, DerefMut)] -struct RenderMeshInstancesCpu(EntityHashMap); +pub struct RenderMeshInstancesCpu(EntityHashMap); /// Information that the render world keeps about each entity that contains a /// mesh, when using GPU mesh instance data building. #[derive(Default, Deref, DerefMut)] -struct RenderMeshInstancesGpu(EntityHashMap); +pub struct RenderMeshInstancesGpu(EntityHashMap); impl RenderMeshInstances { /// Creates a new [`RenderMeshInstances`] instance. @@ -507,7 +507,7 @@ impl RenderMeshInstances { /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a /// mesh attached. 
- /*pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { + pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { match *self { RenderMeshInstances::CpuBuilding(ref instances) => { instances.render_mesh_queue_data(entity) @@ -516,7 +516,7 @@ impl RenderMeshInstances { instances.render_mesh_queue_data(entity) } } - }*/ + } } pub(crate) trait RenderMeshInstancesTable { From 10dc0e489a69cf655a13036a9f6010559e9c6ef9 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Sun, 7 Apr 2024 17:26:19 -0700 Subject: [PATCH 31/39] Rename `get_batch_input_index` to `get_index_of_batch_input`. --- crates/bevy_pbr/src/render/mesh.rs | 9 +++++---- crates/bevy_render/src/batching/gpu_preprocessing.rs | 6 +++--- crates/bevy_render/src/batching/mod.rs | 10 +++++----- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index af395a274618d..8470a6efce92d 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -958,14 +958,15 @@ impl GetBatchData for MeshPipeline { impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; - fn get_batch_input_index( + fn get_index_of_batch_input( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(u32, Option)> { // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( - "`get_batch_input_index` should never be called in CPU mesh uniform building mode" + "`get_index_of_batch_input` should never be called in CPU mesh uniform building \ + mode" ); return None; }; @@ -1002,14 +1003,14 @@ impl GetFullBatchData for MeshPipeline { )) } - fn get_binned_batch_input_index( + fn get_index_of_binned_batch_input( (mesh_instances, _): &SystemParamItem, entity: Entity, ) -> Option { // This should only be called during GPU building. 
let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( - "`get_binned_batch_input_index` should never be called in CPU mesh uniform \ + "`get_index_of_binned_batch_input` should never be called in CPU mesh uniform \ building mode" ); return None; diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 3234a1ce6979e..4334494fcb08b 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -174,7 +174,7 @@ pub fn batch_and_prepare_sorted_render_phase( super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { let (input_index, compare_data) = - GFBD::get_batch_input_index(&system_param_item, item.entity())?; + GFBD::get_index_of_batch_input(&system_param_item, item.entity())?; let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { @@ -223,7 +223,7 @@ pub fn batch_and_prepare_binned_render_phase( let mut batch: Option = None; for &entity in &phase.batchable_values[key] { let Some(input_index) = - GFBD::get_binned_batch_input_index(&system_param_item, entity) + GFBD::get_index_of_binned_batch_input(&system_param_item, entity) else { continue; }; @@ -254,7 +254,7 @@ pub fn batch_and_prepare_binned_render_phase( let unbatchables = phase.unbatchable_values.get_mut(key).unwrap(); for &entity in &unbatchables.entities { let Some(input_index) = - GFBD::get_binned_batch_input_index(&system_param_item, entity) + GFBD::get_index_of_binned_batch_input(&system_param_item, entity) else { continue; }; diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 29dbb33f202f1..e461fee6bf6cc 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -83,7 +83,7 @@ pub trait GetBatchData { /// /// This is only called when building instance data on CPU. 
In the GPU /// instance data building path, we use - /// [`GetFullBatchData::get_batch_input_index`] instead. + /// [`GetFullBatchData::get_index_of_batch_input`] instead. fn get_batch_data( param: &SystemParamItem, query_item: Entity, @@ -103,8 +103,8 @@ pub trait GetFullBatchData: GetBatchData { /// [`crate::render_resource::GpuArrayBuffer`]. /// /// This is only called when building uniforms on CPU. In the GPU instance - /// buffer building path, we use [`GetFullBatchData::get_batch_input_index`] - /// instead. + /// buffer building path, we use + /// [`GetFullBatchData::get_index_of_batch_input`] instead. fn get_binned_batch_data( param: &SystemParamItem, query_item: Entity, @@ -117,7 +117,7 @@ pub trait GetFullBatchData: GetBatchData { /// extraction phase before we got here, so this function shouldn't need to /// look up any render data. If CPU instance buffer building is in use, this /// function will never be called. - fn get_batch_input_index( + fn get_index_of_batch_input( param: &SystemParamItem, query_item: Entity, ) -> Option<(u32, Option)>; @@ -129,7 +129,7 @@ pub trait GetFullBatchData: GetBatchData { /// extraction phase before we got here, so this function shouldn't need to /// look up any render data. If CPU instance buffer building is in use, this /// function will never be called. - fn get_binned_batch_input_index( + fn get_index_of_binned_batch_input( param: &SystemParamItem, query_item: Entity, ) -> Option; From 60138b77186264c911b9a54687d96eda7476adc0 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 8 Apr 2024 01:19:51 -0700 Subject: [PATCH 32/39] Revert the change to split out `queue_material_meshes`, etc. I didn't see any evidence that this improved performance, and it hurts maintainability. 
--- crates/bevy_pbr/src/material.rs | 95 +---------------------------- crates/bevy_pbr/src/prepass/mod.rs | 82 +------------------------ crates/bevy_pbr/src/render/light.rs | 70 +-------------------- 3 files changed, 4 insertions(+), 243 deletions(-) diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index bff8587101ad5..0be450820e9f3 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -515,96 +515,7 @@ pub const fn screen_space_specular_transmission_pipeline_key( /// For each view, iterates over all the meshes visible from that view and adds /// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. #[allow(clippy::too_many_arguments)] -pub fn queue_material_meshes( - opaque_draw_functions: Res>, - alpha_mask_draw_functions: Res>, - transmissive_draw_functions: Res>, - transparent_draw_functions: Res>, - material_pipeline: Res>, - pipelines: ResMut>>, - pipeline_cache: Res, - msaa: Res, - render_meshes: Res>, - render_materials: Res>, - render_mesh_instances: Res, - render_material_instances: Res>, - render_lightmaps: Res, - views: Query<( - &ExtractedView, - &VisibleEntities, - Option<&Tonemapping>, - Option<&DebandDither>, - Option<&ShadowFilteringMethod>, - Has, - ( - Has, - Has, - Has, - Has, - ), - Option<&Camera3d>, - Has, - Option<&Projection>, - &mut BinnedRenderPhase, - &mut BinnedRenderPhase, - &mut SortedRenderPhase, - &mut SortedRenderPhase, - ( - Has>, - Has>, - ), - )>, -) where - M: Material, - M::Data: PartialEq + Eq + Hash + Clone, -{ - match *render_mesh_instances { - RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { - queue_material_meshes_with_render_mesh_instances( - opaque_draw_functions, - alpha_mask_draw_functions, - transmissive_draw_functions, - transparent_draw_functions, - material_pipeline, - pipelines, - pipeline_cache, - msaa, - render_meshes, - render_materials, - render_mesh_instances, - render_material_instances, - render_lightmaps, - 
views, - ); - } - RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { - queue_material_meshes_with_render_mesh_instances( - opaque_draw_functions, - alpha_mask_draw_functions, - transmissive_draw_functions, - transparent_draw_functions, - material_pipeline, - pipelines, - pipeline_cache, - msaa, - render_meshes, - render_materials, - render_mesh_instances, - render_material_instances, - render_lightmaps, - views, - ); - } - } -} - -/// For each view, iterates over all the meshes visible from that view and adds -/// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. -/// -/// This is a helper function. We dispatch to it in order to avoid branching on -/// the variant of [`RenderMeshInstances`] in a hot loop. -#[allow(clippy::too_many_arguments)] -fn queue_material_meshes_with_render_mesh_instances( +pub fn queue_material_meshes( opaque_draw_functions: Res>, alpha_mask_draw_functions: Res>, transmissive_draw_functions: Res>, @@ -615,7 +526,7 @@ fn queue_material_meshes_with_render_mesh_instances( msaa: Res, render_meshes: Res>, render_materials: Res>, - render_mesh_instances: &RMIT, + render_mesh_instances: Res, render_material_instances: Res>, render_lightmaps: Res, mut views: Query<( @@ -644,9 +555,7 @@ fn queue_material_meshes_with_render_mesh_instances( ), )>, ) where - M: Material, M::Data: PartialEq + Eq + Hash + Clone, - RMIT: RenderMeshInstancesTable, { for ( view, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index a8dfe0c428202..30f2b78cb66cc 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -701,84 +701,6 @@ pub fn prepare_prepass_view_bind_group( #[allow(clippy::too_many_arguments)] pub fn queue_prepass_material_meshes( - opaque_draw_functions: Res>, - alpha_mask_draw_functions: Res>, - opaque_deferred_draw_functions: Res>, - alpha_mask_deferred_draw_functions: Res>, - prepass_pipeline: Res>, - pipelines: ResMut>>, - pipeline_cache: Res, - 
msaa: Res, - render_meshes: Res>, - render_mesh_instances: Res, - render_materials: Res>, - render_material_instances: Res>, - render_lightmaps: Res, - views: Query< - ( - &ExtractedView, - &VisibleEntities, - Option<&mut BinnedRenderPhase>, - Option<&mut BinnedRenderPhase>, - Option<&mut BinnedRenderPhase>, - Option<&mut BinnedRenderPhase>, - Option<&DepthPrepass>, - Option<&NormalPrepass>, - Option<&MotionVectorPrepass>, - Option<&DeferredPrepass>, - ), - Or<( - With>, - With>, - With>, - With>, - )>, - >, -) where - M::Data: PartialEq + Eq + Hash + Clone, -{ - match *render_mesh_instances { - RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { - queue_prepass_material_meshes_with_render_mesh_instances( - opaque_draw_functions, - alpha_mask_draw_functions, - opaque_deferred_draw_functions, - alpha_mask_deferred_draw_functions, - prepass_pipeline, - pipelines, - pipeline_cache, - msaa, - render_meshes, - render_mesh_instances, - render_materials, - render_material_instances, - render_lightmaps, - views, - ); - } - RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { - queue_prepass_material_meshes_with_render_mesh_instances( - opaque_draw_functions, - alpha_mask_draw_functions, - opaque_deferred_draw_functions, - alpha_mask_deferred_draw_functions, - prepass_pipeline, - pipelines, - pipeline_cache, - msaa, - render_meshes, - render_mesh_instances, - render_materials, - render_material_instances, - render_lightmaps, - views, - ); - } - } -} - -#[allow(clippy::too_many_arguments)] -fn queue_prepass_material_meshes_with_render_mesh_instances( opaque_draw_functions: Res>, alpha_mask_draw_functions: Res>, opaque_deferred_draw_functions: Res>, @@ -788,7 +710,7 @@ fn queue_prepass_material_meshes_with_render_mesh_instances( pipeline_cache: Res, msaa: Res, render_meshes: Res>, - render_mesh_instances: &RMIT, + render_mesh_instances: Res, render_materials: Res>, render_material_instances: Res>, render_lightmaps: Res, @@ -813,9 +735,7 @@ fn 
queue_prepass_material_meshes_with_render_mesh_instances( )>, >, ) where - M: Material, M::Data: PartialEq + Eq + Hash + Clone, - RMIT: RenderMeshInstancesTable, { let opaque_draw_prepass = opaque_draw_functions .read() diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 94341ced0a56b..9fa2b28afc7cf 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -1598,79 +1598,13 @@ pub fn prepare_clusters( /// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as /// appropriate. #[allow(clippy::too_many_arguments)] -pub fn queue_shadows( +pub fn queue_shadows( shadow_draw_functions: Res>, prepass_pipeline: Res>, render_meshes: Res>, render_mesh_instances: Res, render_materials: Res>, render_material_instances: Res>, - pipelines: ResMut>>, - pipeline_cache: Res, - render_lightmaps: Res, - view_lights: Query<(Entity, &ViewLightEntities)>, - view_light_shadow_phases: Query<(&LightEntity, &mut BinnedRenderPhase)>, - point_light_entities: Query<&CubemapVisibleEntities, With>, - directional_light_entities: Query<&CascadesVisibleEntities, With>, - spot_light_entities: Query<&VisibleEntities, With>, -) where - M: Material, - M::Data: PartialEq + Eq + Hash + Clone, -{ - match *render_mesh_instances { - RenderMeshInstances::CpuBuilding(ref render_mesh_instances) => { - queue_shadows_with_render_mesh_instances( - shadow_draw_functions, - prepass_pipeline, - render_meshes, - render_mesh_instances, - render_materials, - render_material_instances, - pipelines, - pipeline_cache, - render_lightmaps, - view_lights, - view_light_shadow_phases, - point_light_entities, - directional_light_entities, - spot_light_entities, - ); - } - RenderMeshInstances::GpuBuilding(ref render_mesh_instances) => { - queue_shadows_with_render_mesh_instances( - shadow_draw_functions, - prepass_pipeline, - render_meshes, - render_mesh_instances, - render_materials, - render_material_instances, - pipelines, - pipeline_cache, 
- render_lightmaps, - view_lights, - view_light_shadow_phases, - point_light_entities, - directional_light_entities, - spot_light_entities, - ); - } - } -} - -/// For each shadow cascade, iterates over all the meshes "visible" from it and -/// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as -/// appropriate. -/// -/// This is a helper function. We dispatch to it in order to avoid branching on -/// the variant of [`RenderMeshInstances`] in a hot loop. -#[allow(clippy::too_many_arguments)] -fn queue_shadows_with_render_mesh_instances( - shadow_draw_functions: Res>, - prepass_pipeline: Res>, - render_meshes: Res>, - render_mesh_instances: &RMIT, - render_materials: Res>, - render_material_instances: Res>, mut pipelines: ResMut>>, pipeline_cache: Res, render_lightmaps: Res, @@ -1680,9 +1614,7 @@ fn queue_shadows_with_render_mesh_instances( directional_light_entities: Query<&CascadesVisibleEntities, With>, spot_light_entities: Query<&VisibleEntities, With>, ) where - M: Material, M::Data: PartialEq + Eq + Hash + Clone, - RMIT: RenderMeshInstancesTable, { for (entity, view_lights) in &view_lights { let draw_shadow_mesh = shadow_draw_functions.read().id::>(); From aa7694cf0fb5df5aa829cd9a5cd0cf8910f29761 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 8 Apr 2024 01:52:27 -0700 Subject: [PATCH 33/39] Fix `WGPU_SETTINGS_PRIO="webgl2"` by doing proper GPU limits checks, and fix the wasm port by making the compute pipeline optional in `DrawMesh`. 
--- crates/bevy_pbr/src/lib.rs | 19 ++--- crates/bevy_pbr/src/render/gpu_preprocess.rs | 48 +++++++----- crates/bevy_pbr/src/render/mesh.rs | 78 +++++++++---------- .../src/batching/gpu_preprocessing.rs | 7 ++ 4 files changed, 78 insertions(+), 74 deletions(-) diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 08438fca791e4..5dab22a31bd9b 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -137,7 +137,8 @@ pub struct PbrPlugin { pub add_default_deferred_lighting_plugin: bool, /// Controls if GPU [`MeshUniform`] building is enabled. /// - /// This requires compute shader support. + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. pub use_gpu_instance_buffer_builder: bool, } @@ -146,14 +147,7 @@ impl Default for PbrPlugin { Self { prepass_enabled: true, add_default_deferred_lighting_plugin: true, - - // The GPU instance buffer builder requires compute shaders, which - // aren't available on any version of WebGL. 
- use_gpu_instance_buffer_builder: cfg!(any( - feature = "webgpu", - not(feature = "webgl"), - not(target_arch = "wasm32"), - )), + use_gpu_instance_buffer_builder: true, } } } @@ -308,6 +302,9 @@ impl Plugin for PbrPlugin { ExtractComponentPlugin::::default(), LightmapPlugin, LightProbePlugin, + GpuMeshPreprocessPlugin { + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, + }, )) .configure_sets( PostUpdate, @@ -368,10 +365,6 @@ impl Plugin for PbrPlugin { app.add_plugins(DeferredPbrLightingPlugin); } - if self.use_gpu_instance_buffer_builder { - app.add_plugins(GpuMeshPreprocessPlugin); - } - app.world_mut() .resource_mut::>() .insert( diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index c331957366922..6ff721ba29b03 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -20,7 +20,7 @@ use bevy_ecs::{ world::{FromWorld, World}, }; use bevy_render::{ - batching::gpu_preprocessing::{BatchedInstanceBuffers, PreprocessWorkItem}, + batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem}, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ binding_types::{storage_buffer, storage_buffer_read_only}, @@ -47,7 +47,13 @@ const WORKGROUP_SIZE: usize = 64; /// /// This will only be added if the platform supports compute shaders (e.g. not /// on WebGL 2). -pub struct GpuMeshPreprocessPlugin; +pub struct GpuMeshPreprocessPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, +} /// The render node for the mesh uniform building pass. 
pub struct GpuPreprocessNode { @@ -79,22 +85,6 @@ impl Plugin for GpuMeshPreprocessPlugin { "mesh_preprocess.wgsl", Shader::from_wgsl ); - - let Some(render_app) = app.get_sub_app_mut(RenderApp) else { - return; - }; - - render_app.add_systems( - Render, - ( - prepare_preprocess_pipeline.in_set(RenderSet::Prepare), - prepare_preprocess_bind_groups - .run_if( - resource_exists::>, - ) - .in_set(RenderSet::PrepareBindGroups), - ), - ); } fn finish(&self, app: &mut App) { @@ -102,12 +92,32 @@ impl Plugin for GpuMeshPreprocessPlugin { return; }; + // This plugin does nothing if GPU instance buffer building isn't in + // use. + let render_device = render_app.world().resource::(); + if !self.use_gpu_instance_buffer_builder + || !gpu_preprocessing::can_preprocess_on_gpu(render_device) + { + return; + } + // Stitch the node in. render_app .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) .init_resource::() - .init_resource::>(); + .init_resource::>() + .add_systems( + Render, + ( + prepare_preprocess_pipeline.in_set(RenderSet::Prepare), + prepare_preprocess_bind_groups + .run_if( + resource_exists::>, + ) + .in_set(RenderSet::PrepareBindGroups), + ) + ); } } diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 8470a6efce92d..402197d80904b 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -51,10 +51,12 @@ use self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; /// Provides support for rendering PBR meshes. +#[derive(Default)] pub struct MeshRenderPlugin { /// Whether we're building [`MeshUniform`]s on GPU. /// - /// If this is false, we're building them on CPU. + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. 
pub use_gpu_instance_buffer_builder: bool, } @@ -79,14 +81,6 @@ pub const MORPH_HANDLE: Handle = Handle::weak_from_u128(9709828135876073 #[cfg(debug_assertions)] pub const MESH_PIPELINE_VIEW_LAYOUT_SAFE_MAX_TEXTURES: usize = 10; -impl Default for MeshRenderPlugin { - fn default() -> Self { - Self { - use_gpu_instance_buffer_builder: true, - } - } -} - impl Plugin for MeshRenderPlugin { fn build(&self, app: &mut App) { load_internal_asset!(app, FORWARD_IO_HANDLE, "forward_io.wgsl", Shader::from_wgsl); @@ -138,16 +132,12 @@ impl Plugin for MeshRenderPlugin { )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - let render_mesh_instances = - RenderMeshInstances::new(self.use_gpu_instance_buffer_builder); - render_app .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() - .insert_resource(render_mesh_instances) .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) .add_systems( ExtractSchedule, @@ -162,9 +152,24 @@ impl Plugin for MeshRenderPlugin { prepare_mesh_view_bind_groups.in_set(RenderSet::PrepareBindGroups), ), ); + } + } + + fn finish(&self, app: &mut App) { + let mut mesh_bindings_shader_defs = Vec::with_capacity(1); + + if let Some(render_app) = app.get_sub_app_mut(RenderApp) { + let render_device = render_app.world().resource::(); + let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder + && gpu_preprocessing::can_preprocess_on_gpu(render_device); + + let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder); + render_app.insert_resource(render_mesh_instances); - if self.use_gpu_instance_buffer_builder { + if use_gpu_instance_buffer_builder { render_app + .init_resource::>( + ) .add_systems( ExtractSchedule, extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), @@ -180,7 +185,11 @@ impl Plugin for MeshRenderPlugin { ), ); } else { + let render_device = render_app.world().resource::(); + let cpu_batched_instance_buffer = + 
no_gpu_preprocessing::BatchedInstanceBuffer::::new(render_device); render_app + .insert_resource(cpu_batched_instance_buffer) .add_systems( ExtractSchedule, extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), @@ -190,23 +199,6 @@ impl Plugin for MeshRenderPlugin { no_gpu_preprocessing::write_batched_instance_buffer:: .in_set(RenderSet::PrepareResourcesFlush), ); - } - } - } - - fn finish(&self, app: &mut App) { - let mut mesh_bindings_shader_defs = Vec::with_capacity(1); - - if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - if self.use_gpu_instance_buffer_builder { - render_app - .init_resource::>( - ); - } else { - let render_device = render_app.world().resource::(); - let cpu_batched_instance_buffer = - no_gpu_preprocessing::BatchedInstanceBuffer::::new(render_device); - render_app.insert_resource(cpu_batched_instance_buffer); }; let render_device = render_app.world().resource::(); @@ -1725,7 +1717,7 @@ impl RenderCommand

for DrawMesh { SRes>, SRes, SRes, - SRes, + Option>, ); type ViewQuery = Has; type ItemQuery = (); @@ -1744,16 +1736,18 @@ impl RenderCommand

for DrawMesh { // If we're using GPU preprocessing, then we're dependent on that // compute shader having been run, which of course can only happen if // it's compiled. Otherwise, our mesh instance data won't be present. - if !has_preprocess_bind_group - || !preprocess_pipeline - .pipeline_id - .is_some_and(|preprocess_pipeline_id| { - pipeline_cache - .get_compute_pipeline(preprocess_pipeline_id) - .is_some() - }) - { - return RenderCommandResult::Failure; + if let Some(preprocess_pipeline) = preprocess_pipeline { + if !has_preprocess_bind_group + || !preprocess_pipeline + .pipeline_id + .is_some_and(|preprocess_pipeline_id| { + pipeline_cache + .get_compute_pipeline(preprocess_pipeline_id) + .is_some() + }) + { + return RenderCommandResult::Failure; + } } let meshes = meshes.into_inner(); diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 4334494fcb08b..9870045c015af 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -300,3 +300,10 @@ pub fn write_batched_instance_buffers( work_item_buffer.write_buffer(&render_device, &render_queue); } } + +/// Determines whether it's possible to run preprocessing on the GPU. +/// +/// Currently, this simply checks to see whether compute shaders are supported. 
+pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool { + render_device.limits().max_compute_workgroup_size_x > 0 +} From 202e9ebeb42b715aa467e7a88610aa528834cb9c Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 8 Apr 2024 01:57:52 -0700 Subject: [PATCH 34/39] Address review comment --- crates/bevy_pbr/src/render/mesh.rs | 15 ++++++++++----- .../bevy_render/src/batching/gpu_preprocessing.rs | 6 +++--- crates/bevy_render/src/batching/mod.rs | 4 ++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 402197d80904b..f80fe961c8ea8 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -36,6 +36,7 @@ use bevy_utils::{tracing::error, Entry, HashMap, Parallel}; #[cfg(debug_assertions)] use bevy_utils::warn_once; use bytemuck::{Pod, Zeroable}; +use nonmax::NonMaxU32; use static_assertions::const_assert_eq; use crate::render::{ @@ -379,7 +380,7 @@ pub struct RenderMeshInstanceGpu { /// distance sorting). pub translation: Vec3, /// The index of the [`MeshInputUniform`] in the buffer. - pub current_uniform_index: u32, + pub current_uniform_index: NonMaxU32, } /// CPU data that the render world needs to keep about each entity that contains @@ -778,7 +779,10 @@ fn collect_meshes_for_gpu_building( transform: builder.transform.to_transpose(), lightmap_uv_rect: builder.lightmap_uv_rect, flags: builder.mesh_flags.bits(), - previous_input_index: previous_input_index.unwrap_or(u32::MAX), + previous_input_index: match previous_input_index { + Some(previous_input_index) => previous_input_index.into(), + None => u32::MAX, + }, }) as u32; // Record the [`RenderMeshInstance`]. 
@@ -787,7 +791,8 @@ fn collect_meshes_for_gpu_building( RenderMeshInstanceGpu { translation: builder.transform.translation, shared: builder.shared, - current_uniform_index, + current_uniform_index: NonMaxU32::try_from(current_uniform_index) + .unwrap_or_default(), }, ); } @@ -953,7 +958,7 @@ impl GetFullBatchData for MeshPipeline { fn get_index_of_batch_input( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, - ) -> Option<(u32, Option)> { + ) -> Option<(NonMaxU32, Option)> { // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( @@ -998,7 +1003,7 @@ impl GetFullBatchData for MeshPipeline { fn get_index_of_binned_batch_input( (mesh_instances, _): &SystemParamItem, entity: Entity, - ) -> Option { + ) -> Option { // This should only be called during GPU building. let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { error!( diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 9870045c015af..1d12fc264920e 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -178,7 +178,7 @@ pub fn batch_and_prepare_sorted_render_phase( let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { - input_index, + input_index: input_index.into(), output_index, }); @@ -230,7 +230,7 @@ pub fn batch_and_prepare_binned_render_phase( let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { - input_index, + input_index: input_index.into(), output_index, }); @@ -261,7 +261,7 @@ pub fn batch_and_prepare_binned_render_phase( let output_index = data_buffer.add() as u32; work_item_buffer.push(PreprocessWorkItem { - input_index, + input_index: input_index.into(), output_index, }); diff --git a/crates/bevy_render/src/batching/mod.rs 
b/crates/bevy_render/src/batching/mod.rs index e461fee6bf6cc..8d7faa3023bb3 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -120,7 +120,7 @@ fn get_index_of_batch_input( param: &SystemParamItem, query_item: Entity, - ) -> Option<(u32, Option)>; + ) -> Option<(NonMaxU32, Option)>; /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the /// GPU preprocessing phase will use, for the binning path. @@ -132,7 +132,7 @@ fn get_index_of_binned_batch_input( param: &SystemParamItem, query_item: Entity, - ) -> Option; + ) -> Option; } /// A system that runs early in extraction and clears out all the From eff1d4ec1a4b0da01c03224fd5e3dade718b8031 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 8 Apr 2024 21:46:06 -0700 Subject: [PATCH 35/39] Rename `get_index_of_batch_input` and `get_index_of_binned_batch_input` per Discord discussion --- crates/bevy_pbr/src/render/mesh.rs | 8 ++++---- crates/bevy_render/src/batching/gpu_preprocessing.rs | 6 +++--- crates/bevy_render/src/batching/mod.rs | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f80fe961c8ea8..be95d6feaeb0b 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -955,14 +955,14 @@ impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; - fn get_index_of_batch_input( + fn get_index_and_compare_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(NonMaxU32, Option)> { // This should only be called during GPU building.
        let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else {
             error!(
-                "`get_index_of_batch_input` should never be called in CPU mesh uniform building \
+                "`get_index_and_compare_data` should never be called in CPU mesh uniform building \
                 mode"
             );
             return None;
@@ -1000,14 +1000,14 @@ impl GetFullBatchData for MeshPipeline {
         ))
     }
 
-    fn get_index_of_binned_batch_input(
+    fn get_binned_index(
         (mesh_instances, _): &SystemParamItem<Self::Param>,
         entity: Entity,
     ) -> Option<NonMaxU32> {
         // This should only be called during GPU building.
         let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else {
             error!(
-                "`get_index_of_binned_batch_input` should never be called in CPU mesh uniform \
+                "`get_binned_index` should never be called in CPU mesh uniform \
                 building mode"
             );
             return None;
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index 1d12fc264920e..de243d1940866 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -174,7 +174,7 @@ pub fn batch_and_prepare_sorted_render_phase(
     super::batch_and_prepare_sorted_render_phase::<I, GFBD>(&mut phase, |item| {
         let (input_index, compare_data) =
-            GFBD::get_index_of_batch_input(&system_param_item, item.entity())?;
+            GFBD::get_index_and_compare_data(&system_param_item, item.entity())?;
         let output_index = data_buffer.add() as u32;
 
         work_item_buffer.push(PreprocessWorkItem {
@@ -223,7 +223,7 @@ pub fn batch_and_prepare_binned_render_phase(
         let mut batch: Option<BinnedRenderPhaseBatch> = None;
         for &entity in &phase.batchable_values[key] {
             let Some(input_index) =
-                GFBD::get_index_of_binned_batch_input(&system_param_item, entity)
+                GFBD::get_binned_index(&system_param_item, entity)
             else {
                 continue;
             };
@@ -254,7 +254,7 @@ pub fn batch_and_prepare_binned_render_phase(
         let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
         for &entity in &unbatchables.entities {
             let Some(input_index) =
-                
GFBD::get_index_of_binned_batch_input(&system_param_item, entity)
+                GFBD::get_binned_index(&system_param_item, entity)
             else {
                 continue;
             };
diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs
index 8d7faa3023bb3..0407ebe36a3a9 100644
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@@ -83,7 +83,7 @@ pub trait GetBatchData {
     ///
     /// This is only called when building instance data on CPU. In the GPU
     /// instance data building path, we use
-    /// [`GetFullBatchData::get_index_of_batch_input`] instead.
+    /// [`GetFullBatchData::get_index_and_compare_data`] instead.
     fn get_batch_data(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
@@ -104,7 +104,7 @@ pub trait GetFullBatchData: GetBatchData {
     ///
     /// This is only called when building uniforms on CPU. In the GPU instance
     /// buffer building path, we use
-    /// [`GetFullBatchData::get_index_of_batch_input`] instead.
+    /// [`GetFullBatchData::get_index_and_compare_data`] instead.
     fn get_binned_batch_data(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
@@ -117,7 +117,7 @@ pub trait GetFullBatchData: GetBatchData {
     /// extraction phase before we got here, so this function shouldn't need to
     /// look up any render data. If CPU instance buffer building is in use, this
     /// function will never be called.
-    fn get_index_of_batch_input(
+    fn get_index_and_compare_data(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
     ) -> Option<(NonMaxU32, Option<Self::CompareData>)>;
@@ -129,7 +129,7 @@ pub trait GetFullBatchData: GetBatchData {
     /// extraction phase before we got here, so this function shouldn't need to
     /// look up any render data. If CPU instance buffer building is in use, this
     /// function will never be called.
-    fn get_index_of_binned_batch_input(
+    fn get_binned_index(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
     ) -> Option<NonMaxU32>;

From 87e5cbf7b491f994c21f3ee0221b18d288b0a522 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Mon, 8 Apr 2024 21:48:25 -0700
Subject: [PATCH 36/39] Rustfmt police

---
 crates/bevy_render/src/batching/gpu_preprocessing.rs | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
index de243d1940866..8757943419eb6 100644
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -222,9 +222,7 @@ pub fn batch_and_prepare_binned_render_phase(
     for key in &phase.batchable_keys {
         let mut batch: Option<BinnedRenderPhaseBatch> = None;
         for &entity in &phase.batchable_values[key] {
-            let Some(input_index) =
-                GFBD::get_binned_index(&system_param_item, entity)
-            else {
+            let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
                 continue;
             };
             let output_index = data_buffer.add() as u32;
@@ -253,9 +251,7 @@
     for key in &phase.unbatchable_keys {
         let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
         for &entity in &unbatchables.entities {
-            let Some(input_index) =
-                GFBD::get_binned_index(&system_param_item, entity)
-            else {
+            let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
                 continue;
             };
             let output_index = data_buffer.add() as u32;

From 9714d443659f81a59e90479e2cc5f2a32438d8c5 Mon Sep 17 00:00:00 2001
From: Patrick Walton <pcwalton@mimiga.net>
Date: Tue, 9 Apr 2024 12:57:04 -0700
Subject: [PATCH 37/39] Run preprocessing before the shadow pass; seems to fix
 flickering issues

---
 crates/bevy_pbr/src/render/gpu_preprocess.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs
index 6ff721ba29b03..e8c3f0c376693 100644
--- 
a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -105,6 +105,7 @@ impl Plugin for GpuMeshPreprocessPlugin { render_app .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) .init_resource::() .init_resource::>() .add_systems( From a85cde172275f7c650585e61d901cb89de2cedfc Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 9 Apr 2024 13:02:02 -0700 Subject: [PATCH 38/39] Address review comments --- crates/bevy_pbr/src/prepass/mod.rs | 13 +++++-------- crates/bevy_pbr/src/render/gpu_preprocess.rs | 8 ++++---- crates/bevy_pbr/src/render/mesh.rs | 2 +- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 59c999671011e..230b7455d83c8 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -144,14 +144,11 @@ where update_mesh_previous_global_transforms, update_previous_view_data, ), - ); - } - - if no_prepass_plugin_loaded { - app.add_plugins(( - BinnedRenderPhasePlugin::::default(), - BinnedRenderPhasePlugin::::default(), - )); + ) + .add_plugins(( + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + )); } let Some(render_app) = app.get_sub_app_mut(RenderApp) else { diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index e8c3f0c376693..21eff19668a65 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -206,13 +206,13 @@ impl FromWorld for PreprocessPipeline { ShaderStages::COMPUTE, ( // `current_input` - storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer_read_only::(false), // `previous_input` - storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + 
storage_buffer_read_only::(false), // `indices` - storage_buffer_read_only::(/*has_dynamic_offset=*/ false), + storage_buffer_read_only::(false), // `output` - storage_buffer::(/*has_dynamic_offset=*/ false), + storage_buffer::(false), ), ); diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 0d7cc81e7abbc..ddc81604f4d13 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -51,7 +51,7 @@ use self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; -/// Provides support for rendering PBR meshes. +/// Provides support for rendering 3D meshes. #[derive(Default)] pub struct MeshRenderPlugin { /// Whether we're building [`MeshUniform`]s on GPU. From aacaa177298855180d3bd8d48c008a2fa567ab66 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 9 Apr 2024 13:02:45 -0700 Subject: [PATCH 39/39] Fix formatting --- crates/bevy_render/src/batching/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs index 0407ebe36a3a9..6811451f37dbd 100644 --- a/crates/bevy_render/src/batching/mod.rs +++ b/crates/bevy_render/src/batching/mod.rs @@ -77,7 +77,7 @@ pub trait GetBatchData { /// instances. type BufferData: GpuArrayBufferable + Sync + Send + 'static; /// Get the per-instance data to be inserted into the - /// [`crate::render_resource::GpuArrayBuffer`]. If the instance can be + /// [`crate::render_resource::GpuArrayBuffer`]. If the instance can be /// batched, also return the data used for comparison when deciding whether /// draws can be batched, else return None for the `CompareData`. ///