diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 6603df730696f..7d7b7c9afe9ab 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -79,6 +79,8 @@ pub mod graph { /// Label for the screen space ambient occlusion render node. ScreenSpaceAmbientOcclusion, DeferredLightingPass, + /// Label for the compute shader instance data building pass. + GpuPreprocess, } } @@ -133,6 +135,11 @@ pub struct PbrPlugin { pub prepass_enabled: bool, /// Controls if [`DeferredPbrLightingPlugin`] is added. pub add_default_deferred_lighting_plugin: bool, + /// Controls if GPU [`MeshUniform`] building is enabled. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, } impl Default for PbrPlugin { @@ -140,6 +147,7 @@ impl Default for PbrPlugin { Self { prepass_enabled: true, add_default_deferred_lighting_plugin: true, + use_gpu_instance_buffer_builder: true, } } } @@ -280,7 +288,9 @@ impl Plugin for PbrPlugin { .register_type::() .init_resource::() .add_plugins(( - MeshRenderPlugin, + MeshRenderPlugin { + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, + }, MaterialPlugin:: { prepass_enabled: self.prepass_enabled, ..Default::default() @@ -292,6 +302,9 @@ impl Plugin for PbrPlugin { ExtractComponentPlugin::::default(), LightmapPlugin, LightProbePlugin, + GpuMeshPreprocessPlugin { + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, + }, )) .configure_sets( PostUpdate, @@ -386,15 +399,6 @@ impl Plugin for PbrPlugin { let draw_3d_graph = graph.get_sub_graph_mut(Core3d).unwrap(); draw_3d_graph.add_node(NodePbr::ShadowPass, shadow_pass_node); draw_3d_graph.add_node_edge(NodePbr::ShadowPass, Node3d::StartMainPass); - - render_app.ignore_ambiguity( - bevy_render::Render, - bevy_core_pipeline::core_3d::prepare_core_3d_transmission_textures, - bevy_render::batching::batch_and_prepare_sorted_render_phase::< - bevy_core_pipeline::core_3d::Transmissive3d, - MeshPipeline, - >, - ); } fn finish(&self, app: &mut App) { diff --git a/crates/bevy_pbr/src/lightmap/mod.rs b/crates/bevy_pbr/src/lightmap/mod.rs index a57c2b8e4618e..fbb5ea2731379 100644 --- a/crates/bevy_pbr/src/lightmap/mod.rs +++ b/crates/bevy_pbr/src/lightmap/mod.rs @@ -48,7 +48,7 @@ use bevy_render::{ }; use bevy_utils::HashSet; -use crate::RenderMeshInstances; +use crate::{ExtractMeshesSet, RenderMeshInstances}; /// The ID of the lightmap shader. 
pub const LIGHTMAP_SHADER_HANDLE: Handle = @@ -132,10 +132,9 @@ impl Plugin for LightmapPlugin { return; }; - render_app.init_resource::().add_systems( - ExtractSchedule, - extract_lightmaps.after(crate::extract_meshes), - ); + render_app + .init_resource::() + .add_systems(ExtractSchedule, extract_lightmaps.after(ExtractMeshesSet)); } } @@ -159,8 +158,8 @@ fn extract_lightmaps( if !view_visibility.get() || images.get(&lightmap.image).is_none() || !render_mesh_instances - .get(&entity) - .and_then(|mesh_instance| meshes.get(mesh_instance.mesh_asset_id)) + .mesh_asset_id(entity) + .and_then(|mesh_asset_id| meshes.get(mesh_asset_id)) .is_some_and(|mesh| mesh.layout.0.contains(Mesh::ATTRIBUTE_UV_1.id)) { continue; diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index 4785c4aed98db..082a07dbb6219 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -508,6 +508,8 @@ pub const fn screen_space_specular_transmission_pipeline_key( } } +/// For each view, iterates over all the meshes visible from that view and adds +/// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. #[allow(clippy::too_many_arguments)] pub fn queue_material_meshes( opaque_draw_functions: Res>, @@ -647,7 +649,8 @@ pub fn queue_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(mesh) = render_meshes.get(mesh_instance.mesh_asset_id) else { @@ -693,8 +696,7 @@ pub fn queue_material_meshes( match material.properties.alpha_mode { AlphaMode::Opaque => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -717,8 +719,7 @@ pub fn queue_material_meshes( } AlphaMode::Mask(_) => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -746,8 +747,7 @@ pub fn queue_material_meshes( | AlphaMode::Premultiplied | AlphaMode::Add | AlphaMode::Multiply => { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transparent_phase.add(Transparent3d { entity: *visible_entity, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index ad954cbc26f43..230b7455d83c8 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -1,6 +1,5 @@ mod prepass_bindings; -use bevy_render::batching::{batch_and_prepare_binned_render_phase, sort_binned_render_phase}; use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef}; use bevy_render::render_resource::binding_types::uniform_buffer; pub use prepass_bindings::*; @@ -145,7 +144,11 @@ where update_mesh_previous_global_transforms, update_previous_view_data, ), - ); + ) + .add_plugins(( + 
BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + )); } let Some(render_app) = app.get_sub_app_mut(RenderApp) else { @@ -157,18 +160,7 @@ where .add_systems(ExtractSchedule, extract_camera_previous_view_data) .add_systems( Render, - ( - ( - sort_binned_render_phase::, - sort_binned_render_phase:: - ).in_set(RenderSet::PhaseSort), - ( - prepare_previous_view_uniforms, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - ).in_set(RenderSet::PrepareResources), - ) + prepare_previous_view_uniforms.in_set(RenderSet::PrepareResources), ); } @@ -786,7 +778,8 @@ pub fn queue_prepass_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(material) = render_materials.get(*material_asset_id) else { diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs new file mode 100644 index 0000000000000..21eff19668a65 --- /dev/null +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -0,0 +1,299 @@ +//! GPU mesh preprocessing. +//! +//! This is an optional pass that uses a compute shader to reduce the amount of +//! data that has to be transferred from the CPU to the GPU. When enabled, +//! instead of transferring [`MeshUniform`]s to the GPU, we transfer the smaller +//! [`MeshInputUniform`]s instead and use the GPU to calculate the remaining +//! derived fields in [`MeshUniform`]. + +use std::num::NonZeroU64; + +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_ecs::{ + component::Component, + entity::Entity, + query::QueryState, + schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, + system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, + world::{FromWorld, World}, +}; +use bevy_render::{ + batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem}, + render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, + render_resource::{ + binding_types::{storage_buffer, storage_buffer_read_only}, + BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding, + CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, + DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, + SpecializedComputePipeline, SpecializedComputePipelines, + }, + renderer::{RenderContext, RenderDevice}, + Render, RenderApp, RenderSet, +}; +use bevy_utils::tracing::warn; + +use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; + +/// The handle to the `mesh_preprocess.wgsl` compute shader. +pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = + Handle::weak_from_u128(16991728318640779533); + +/// The GPU workgroup size. +const WORKGROUP_SIZE: usize = 64; + +/// A plugin that builds mesh uniforms on GPU. +/// +/// This will only be added if the platform supports compute shaders (e.g. not +/// on WebGL 2). +pub struct GpuMeshPreprocessPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, +} + +/// The render node for the mesh uniform building pass. 
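// This node is wired into `Core3d` upstream of both `Node3d::Prepass` and
// `NodePbr::ShadowPass` (see the `add_render_graph_edges` calls in `finish`
// below), so every pass that reads the expanded `MeshUniform` buffer runs
// after the compute dispatch.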
+pub struct GpuPreprocessNode { + view_query: QueryState<(Entity, Read)>, +} + +/// The compute shader pipeline for the mesh uniform building pass. +#[derive(Resource)] +pub struct PreprocessPipeline { + /// The single bind group layout for the compute shader. + pub bind_group_layout: BindGroupLayout, + /// The pipeline ID for the compute shader. + /// + /// This gets filled in in `prepare_preprocess_pipeline`. + pub pipeline_id: Option, +} + +/// The compute shader bind group for the mesh uniform building pass. +/// +/// This goes on the view. +#[derive(Component)] +pub struct PreprocessBindGroup(BindGroup); + +impl Plugin for GpuMeshPreprocessPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + MESH_PREPROCESS_SHADER_HANDLE, + "mesh_preprocess.wgsl", + Shader::from_wgsl + ); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + // This plugin does nothing if GPU instance buffer building isn't in + // use. + let render_device = render_app.world().resource::(); + if !self.use_gpu_instance_buffer_builder + || !gpu_preprocessing::can_preprocess_on_gpu(render_device) + { + return; + } + + // Stitch the node in. + render_app + .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) + .init_resource::() + .init_resource::>() + .add_systems( + Render, + ( + prepare_preprocess_pipeline.in_set(RenderSet::Prepare), + prepare_preprocess_bind_groups + .run_if( + resource_exists::>, + ) + .in_set(RenderSet::PrepareBindGroups), + ) + ); + } +} + +impl FromWorld for GpuPreprocessNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl Node for GpuPreprocessNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + // Grab the [`BatchedInstanceBuffers`]. + let BatchedInstanceBuffers { + work_item_buffers: ref index_buffers, + .. + } = world.resource::>(); + + let pipeline_cache = world.resource::(); + let preprocess_pipeline = world.resource::(); + + let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't created"); + return Ok(()); + }; + + let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; + + let mut compute_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some("mesh preprocessing"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(preprocess_pipeline); + + // Run the compute passes. 
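        // One compute invocation per `PreprocessWorkItem`: with
        // WORKGROUP_SIZE = 64, a view with, say, 1000 visible meshes
        // dispatches ceil(1000 / 64) = 16 workgroups, and the shader's bounds
        // check against `arrayLength(&work_items)` makes the spare
        // invocations in the final workgroup return early.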
+ for (view, bind_group) in self.view_query.iter_manual(world) { + let index_buffer = &index_buffers[&view]; + compute_pass.set_bind_group(0, &bind_group.0, &[]); + let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE); + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + Ok(()) + } +} + +impl SpecializedComputePipeline for PreprocessPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("mesh preprocessing".into()), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![], + shader: MESH_PREPROCESS_SHADER_HANDLE, + shader_defs: vec![], + entry_point: "main".into(), + } + } +} + +impl FromWorld for PreprocessPipeline { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // `current_input` + storage_buffer_read_only::(false), + // `previous_input` + storage_buffer_read_only::(false), + // `indices` + storage_buffer_read_only::(false), + // `output` + storage_buffer::(false), + ), + ); + + let bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms bind group layout", + &bind_group_layout_entries, + ); + + PreprocessPipeline { + bind_group_layout, + pipeline_id: None, + } + } +} + +/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary. +pub fn prepare_preprocess_pipeline( + pipeline_cache: Res, + mut pipelines: ResMut>, + mut preprocess_pipeline: ResMut, +) { + if preprocess_pipeline.pipeline_id.is_some() { + return; + } + + let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ()); + preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id); +} + +/// A system that attaches the mesh uniform buffers to the bind group for the +/// compute shader. +pub fn prepare_preprocess_bind_groups( + mut commands: Commands, + render_device: Res, + batched_instance_buffers: Res>, + pipeline: Res, +) { + // Grab the `BatchedInstanceBuffers`. + let BatchedInstanceBuffers { + data_buffer: ref data_buffer_vec, + work_item_buffers: ref index_buffers, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + } = batched_instance_buffers.into_inner(); + + let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( + current_input_buffer_vec.buffer(), + previous_input_buffer_vec.buffer(), + data_buffer_vec.buffer(), + ) else { + return; + }; + + for (view, index_buffer_vec) in index_buffers { + let Some(index_buffer) = index_buffer_vec.buffer() else { + continue; + }; + + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
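        // `PreprocessWorkItem` is two `u32`s, so `min_size()` here is 8 bytes
        // and a buffer of, say, 10 work items gets an 80-byte binding even if
        // the backing allocation is larger. `arrayLength(&work_items)` in the
        // shader then reports the live element count rather than the
        // allocation size.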
+ let index_buffer_size = NonZeroU64::try_from( + index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + commands + .entity(*view) + .insert(PreprocessBindGroup(render_device.create_bind_group( + "preprocess_bind_group", + &pipeline.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + BindingResource::Buffer(BufferBinding { + buffer: index_buffer, + offset: 0, + size: index_buffer_size, + }), + data_buffer.as_entire_binding(), + )), + ))); + } +} diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index dfc338f2fb3e1..070e1a9d7b99d 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -1596,6 +1596,9 @@ pub fn prepare_clusters( } } +/// For each shadow cascade, iterates over all the meshes "visible" from it and +/// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as +/// appropriate. #[allow(clippy::too_many_arguments)] pub fn queue_shadows( shadow_draw_functions: Res>, @@ -1651,10 +1654,14 @@ pub fn queue_shadows( light_key.set(MeshPipelineKey::DEPTH_CLAMP_ORTHO, is_directional_light); for entity in visible_entities.iter().copied() { - let Some(mesh_instance) = render_mesh_instances.get(&entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) + else { continue; }; - if !mesh_instance.shadow_caster { + if !mesh_instance + .flags + .contains(RenderMeshInstanceFlags::SHADOW_CASTER) + { continue; } let Some(material_asset_id) = render_material_instances.get(&entity) else { diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f64fa2b2b5f12..ddc81604f4d13 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -1,3 +1,5 @@ +use std::mem; + use bevy_asset::{load_internal_asset, AssetId}; use bevy_core_pipeline::{ core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT}, @@ -10,20 +12,22 @@ use bevy_ecs::{ query::ROQueryItem, system::{lifetimeless::*, SystemParamItem, SystemState}, }; -use bevy_math::{Affine3, Rect, UVec2, Vec4}; +use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ - batch_and_prepare_binned_render_phase, batch_and_prepare_sorted_render_phase, - sort_binned_render_phase, write_batched_instance_buffer, GetBatchData, GetBinnedBatchData, - NoAutomaticBatching, + clear_batched_instance_buffers, gpu_preprocessing, no_gpu_preprocessing, GetBatchData, + GetFullBatchData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, - render_phase::{PhaseItem, RenderCommand, RenderCommandResult, TrackedRenderPass}, + render_phase::{ + BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult, + SortedRenderPhasePlugin, TrackedRenderPass, + }, render_resource::*, renderer::{RenderDevice, RenderQueue}, texture::{BevyDefault, DefaultImageSampler, ImageSampler, TextureFormatPixelInfo}, - view::{ViewTarget, ViewUniformOffset, ViewVisibility}, + view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility}, Extract, }; use bevy_transform::components::GlobalTransform; @@ -31,6 +35,8 @@ use bevy_utils::{tracing::error, Entry, HashMap, Parallel}; #[cfg(debug_assertions)] use bevy_utils::warn_once; +use bytemuck::{Pod, Zeroable}; +use nonmax::NonMaxU32; use static_assertions::const_assert_eq; use crate::render::{ @@ -45,8 +51,15 @@ use 
self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; +/// Provides support for rendering 3D meshes. #[derive(Default)] -pub struct MeshRenderPlugin; +pub struct MeshRenderPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, +} pub const FORWARD_IO_HANDLE: Handle = Handle::weak_from_u128(2645551199423808407); pub const MESH_VIEW_TYPES_HANDLE: Handle = Handle::weak_from_u128(8140454348013264787); @@ -108,47 +121,32 @@ impl Plugin for MeshRenderPlugin { app.add_systems( PostUpdate, (no_automatic_skin_batching, no_automatic_morph_batching), - ); + ) + .add_plugins(( + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { render_app - .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() - .allow_ambiguous_resource::>() + .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) .add_systems( ExtractSchedule, - (extract_meshes, extract_skins, extract_morphs), + clear_batched_instance_buffers::.before(ExtractMeshesSet), ) .add_systems( Render, ( - ( - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - ) - .in_set(RenderSet::PhaseSort), - ( - batch_and_prepare_binned_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::< - AlphaMask3dDeferred, - MeshPipeline, - >, - ) - .in_set(RenderSet::PrepareResources), - write_batched_instance_buffer:: - .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), prepare_morphs.in_set(RenderSet::PrepareResources), prepare_mesh_bind_group.in_set(RenderSet::PrepareBindGroups), @@ -162,20 +160,59 @@ impl Plugin for MeshRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size( - render_app.world().resource::(), - ) { + let render_device = render_app.world().resource::(); + let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder + && gpu_preprocessing::can_preprocess_on_gpu(render_device); + + let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder); + render_app.insert_resource(render_mesh_instances); + + if use_gpu_instance_buffer_builder { + render_app + .init_resource::>( + ) + .add_systems( + ExtractSchedule, + extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + ( + gpu_preprocessing::write_batched_instance_buffers:: + .in_set(RenderSet::PrepareResourcesFlush), + gpu_preprocessing::delete_old_work_item_buffers:: + .in_set(RenderSet::ManageViews) + .after(prepare_view_targets), + ), + ); + } else { + let render_device = render_app.world().resource::(); + let cpu_batched_instance_buffer = + 
no_gpu_preprocessing::BatchedInstanceBuffer::::new(render_device); + render_app + .insert_resource(cpu_batched_instance_buffer) + .add_systems( + ExtractSchedule, + extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + no_gpu_preprocessing::write_batched_instance_buffer:: + .in_set(RenderSet::PrepareResourcesFlush), + ); + }; + + let render_device = render_app.world().resource::(); + if let Some(per_object_buffer_batch_size) = + GpuArrayBuffer::::batch_size(render_device) + { mesh_bindings_shader_defs.push(ShaderDefVal::UInt( "PER_OBJECT_BUFFER_BATCH_SIZE".into(), per_object_buffer_batch_size, )); } - render_app - .insert_resource(GpuArrayBuffer::::new( - render_app.world().resource::(), - )) - .init_resource::(); + render_app.init_resource::(); } // Load the mesh_bindings shader module here as it depends on runtime information about @@ -221,6 +258,36 @@ pub struct MeshUniform { pub lightmap_uv_rect: UVec2, } +/// Information that has to be transferred from CPU to GPU in order to produce +/// the full [`MeshUniform`]. +/// +/// This is essentially a subset of the fields in [`MeshUniform`] above. +#[derive(ShaderType, Pod, Zeroable, Clone, Copy)] +#[repr(C)] +pub struct MeshInputUniform { + /// Affine 4x3 matrix transposed to 3x4. + pub transform: [Vec4; 3], + /// Four 16-bit unsigned normalized UV values packed into a `UVec2`: + /// + /// ```text + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` + pub lightmap_uv_rect: UVec2, + /// Various [`MeshFlags`]. + pub flags: u32, + /// The index of this mesh's [`MeshInputUniform`] in the previous frame's + /// buffer, if applicable. + /// + /// This is used for TAA. If not present, this will be `u32::MAX`. + pub previous_input_index: u32, +} + impl MeshUniform { pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option) -> Self { let (inverse_transpose_model_a, inverse_transpose_model_b) = @@ -250,26 +317,263 @@ bitflags::bitflags! { } } -pub struct RenderMeshInstance { +impl MeshFlags { + fn from_components( + transform: &GlobalTransform, + not_shadow_receiver: bool, + transmitted_receiver: bool, + ) -> MeshFlags { + let mut mesh_flags = if not_shadow_receiver { + MeshFlags::empty() + } else { + MeshFlags::SHADOW_RECEIVER + }; + if transmitted_receiver { + mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; + } + if transform.affine().matrix3.determinant().is_sign_positive() { + mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + } + + mesh_flags + } +} + +bitflags::bitflags! { + /// Various useful flags for [`RenderMeshInstance`]s. + #[derive(Clone, Copy)] + pub struct RenderMeshInstanceFlags: u8 { + /// The mesh casts shadows. + const SHADOW_CASTER = 1 << 0; + /// The mesh can participate in automatic batching. + const AUTOMATIC_BATCHING = 1 << 1; + /// The mesh had a transform last frame and so is eligible for TAA. + const HAVE_PREVIOUS_TRANSFORM = 1 << 2; + } +} + +/// CPU data that the render world keeps for each entity, when *not* using GPU +/// mesh uniform building. +#[derive(Deref)] +pub struct RenderMeshInstanceCpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. + #[deref] + pub shared: RenderMeshInstanceShared, + /// The transform of the mesh. 
+ /// + /// This will be written into the [`MeshUniform`] at the appropriate time. pub transforms: MeshTransforms, +} + +/// CPU data that the render world needs to keep for each entity that contains a +/// mesh when using GPU mesh uniform building. +#[derive(Deref)] +pub struct RenderMeshInstanceGpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. + #[deref] + pub shared: RenderMeshInstanceShared, + /// The translation of the mesh. + /// + /// This is the only part of the transform that we have to keep on CPU (for + /// distance sorting). + pub translation: Vec3, + /// The index of the [`MeshInputUniform`] in the buffer. + pub current_uniform_index: NonMaxU32, +} + +/// CPU data that the render world needs to keep about each entity that contains +/// a mesh. +pub struct RenderMeshInstanceShared { + /// The [`AssetId`] of the mesh. pub mesh_asset_id: AssetId, + /// A slot for the material bind group ID. + /// + /// This is filled in during [`crate::material::queue_material_meshes`]. pub material_bind_group_id: AtomicMaterialBindGroupId, - pub shadow_caster: bool, - pub automatic_batching: bool, + /// Various flags. + pub flags: RenderMeshInstanceFlags, } -impl RenderMeshInstance { +/// Information that is gathered during the parallel portion of mesh extraction +/// when GPU mesh uniform building is enabled. +/// +/// From this, the [`MeshInputUniform`] and [`RenderMeshInstanceGpu`] are +/// prepared. +pub struct RenderMeshInstanceGpuBuilder { + /// Data that will be placed on the [`RenderMeshInstanceGpu`]. + pub shared: RenderMeshInstanceShared, + /// The current transform. + pub transform: Affine3, + /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]: + /// + /// ```text + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` + pub lightmap_uv_rect: UVec2, + /// Various flags. + pub mesh_flags: MeshFlags, +} + +impl RenderMeshInstanceShared { + fn from_components( + previous_transform: Option<&PreviousGlobalTransform>, + handle: &Handle, + not_shadow_caster: bool, + no_automatic_batching: bool, + ) -> Self { + let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); + mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); + mesh_instance_flags.set( + RenderMeshInstanceFlags::AUTOMATIC_BATCHING, + !no_automatic_batching, + ); + mesh_instance_flags.set( + RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, + previous_transform.is_some(), + ); + + RenderMeshInstanceShared { + mesh_asset_id: handle.id(), + + flags: mesh_instance_flags, + material_bind_group_id: AtomicMaterialBindGroupId::default(), + } + } + + /// Returns true if this entity is eligible to participate in automatic + /// batching. pub fn should_batch(&self) -> bool { - self.automatic_batching && self.material_bind_group_id.get().is_some() + self.flags + .contains(RenderMeshInstanceFlags::AUTOMATIC_BATCHING) + && self.material_bind_group_id.get().is_some() + } +} + +/// Information that the render world keeps about each entity that contains a +/// mesh. +/// +/// The set of information needed is different depending on whether CPU or GPU +/// [`MeshUniform`] building is in use. 
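// The CPU/GPU choice below is made once at startup, in
// `MeshRenderPlugin::finish`, from the `RenderDevice` limits; an enum rather
// than a generic parameter keeps a single concrete `Resource` type that
// systems such as `queue_material_meshes` can take either way.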
+#[derive(Resource)] +pub enum RenderMeshInstances { + /// Information needed when using CPU mesh instance data building. + CpuBuilding(RenderMeshInstancesCpu), + /// Information needed when using GPU mesh instance data building. + GpuBuilding(RenderMeshInstancesGpu), +} + +/// Information that the render world keeps about each entity that contains a +/// mesh, when using CPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +pub struct RenderMeshInstancesCpu(EntityHashMap); + +/// Information that the render world keeps about each entity that contains a +/// mesh, when using GPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +pub struct RenderMeshInstancesGpu(EntityHashMap); + +impl RenderMeshInstances { + /// Creates a new [`RenderMeshInstances`] instance. + fn new(use_gpu_instance_buffer_builder: bool) -> RenderMeshInstances { + if use_gpu_instance_buffer_builder { + RenderMeshInstances::GpuBuilding(RenderMeshInstancesGpu::default()) + } else { + RenderMeshInstances::CpuBuilding(RenderMeshInstancesCpu::default()) + } + } + + /// Returns the ID of the mesh asset attached to the given entity, if any. + pub(crate) fn mesh_asset_id(&self, entity: Entity) -> Option> { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => instances.mesh_asset_id(entity), + RenderMeshInstances::GpuBuilding(ref instances) => instances.mesh_asset_id(entity), + } } + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => { + instances.render_mesh_queue_data(entity) + } + RenderMeshInstances::GpuBuilding(ref instances) => { + instances.render_mesh_queue_data(entity) + } + } + } +} + +pub(crate) trait RenderMeshInstancesTable { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option>; + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + fn render_mesh_queue_data(&self, entity: Entity) -> Option; } -#[derive(Default, Resource, Deref, DerefMut)] -pub struct RenderMeshInstances(EntityHashMap); +impl RenderMeshInstancesTable for RenderMeshInstancesCpu { + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.transforms.transform.translation, + }) + } +} -pub fn extract_meshes( +impl RenderMeshInstancesTable for RenderMeshInstancesGpu { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.translation, + }) + } +} + +/// Data that [`crate::material::queue_material_meshes`] and similar systems +/// need in order to place entities that contain meshes in the right batch. +#[derive(Deref)] +pub struct RenderMeshQueueData<'a> { + /// General information about the mesh instance. 
+ #[deref] + pub shared: &'a RenderMeshInstanceShared, + /// The translation of the mesh instance. + pub translation: Vec3, +} + +/// A [`SystemSet`] that encompasses both [`extract_meshes_for_cpu_building`] +/// and [`extract_meshes_for_gpu_building`]. +#[derive(SystemSet, Clone, PartialEq, Eq, Debug, Hash)] +pub struct ExtractMeshesSet; + +/// Extracts meshes from the main world into the render world, populating the +/// [`RenderMeshInstances`]. +/// +/// This is the variant of the system that runs when we're *not* using GPU +/// [`MeshUniform`] building. +pub fn extract_meshes_for_cpu_building( mut render_mesh_instances: ResMut, - mut thread_local_queues: Local>>, + mut render_mesh_instance_queues: Local>>, meshes_query: Extract< Query<( Entity, @@ -299,45 +603,202 @@ pub fn extract_meshes( if !view_visibility.get() { return; } - let transform = transform.affine(); - let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform); - let mut flags = if not_shadow_receiver { - MeshFlags::empty() - } else { - MeshFlags::SHADOW_RECEIVER - }; - if transmitted_receiver { - flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; - } - if transform.matrix3.determinant().is_sign_positive() { - flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; - } - let transforms = MeshTransforms { - transform: (&transform).into(), - previous_transform: (&previous_transform).into(), - flags: flags.bits(), - }; - thread_local_queues.scope(|queue| { + + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, + ); + + render_mesh_instance_queues.scope(|queue| { + let transform = transform.affine(); queue.push(( entity, - RenderMeshInstance { - mesh_asset_id: handle.id(), - transforms, - shadow_caster: !not_shadow_caster, - material_bind_group_id: AtomicMaterialBindGroupId::default(), - automatic_batching: !no_automatic_batching, + RenderMeshInstanceCpu { + transforms: MeshTransforms { + transform: (&transform).into(), + previous_transform: (&previous_transform + .map(|t| t.0) + .unwrap_or(transform)) + .into(), + flags: mesh_flags.bits(), + }, + shared, }, )); }); }, ); + // Collect the render mesh instances. + let RenderMeshInstances::CpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`extract_meshes_for_cpu_building` should only be called if we're using CPU \ + `MeshUniform` building" + ); + }; + render_mesh_instances.clear(); - for queue in thread_local_queues.iter_mut() { + for queue in render_mesh_instance_queues.iter_mut() { render_mesh_instances.extend(queue.drain(..)); } } +/// Extracts meshes from the main world into the render world and queues +/// [`MeshInputUniform`]s to be uploaded to the GPU. +/// +/// This is the variant of the system that runs when we're using GPU +/// [`MeshUniform`] building. 
+pub fn extract_meshes_for_gpu_building( + mut render_mesh_instances: ResMut, + mut batched_instance_buffers: ResMut< + gpu_preprocessing::BatchedInstanceBuffers, + >, + mut render_mesh_instance_queues: Local>>, + mut prev_render_mesh_instances: Local, + meshes_query: Extract< + Query<( + Entity, + &ViewVisibility, + &GlobalTransform, + Option<&PreviousGlobalTransform>, + Option<&Lightmap>, + &Handle, + Has, + Has, + Has, + Has, + )>, + >, +) { + meshes_query.par_iter().for_each( + |( + entity, + view_visibility, + transform, + previous_transform, + lightmap, + handle, + not_shadow_receiver, + transmitted_receiver, + not_shadow_caster, + no_automatic_batching, + )| { + if !view_visibility.get() { + return; + } + + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, + ); + + let lightmap_uv_rect = + lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect)); + + render_mesh_instance_queues.scope(|queue| { + queue.push(( + entity, + RenderMeshInstanceGpuBuilder { + shared, + transform: (&transform.affine()).into(), + lightmap_uv_rect, + mesh_flags, + }, + )); + }); + }, + ); + + collect_meshes_for_gpu_building( + &mut render_mesh_instances, + &mut batched_instance_buffers, + &mut render_mesh_instance_queues, + &mut prev_render_mesh_instances, + ); +} + +/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU +/// mesh uniforms are built. +fn collect_meshes_for_gpu_building( + render_mesh_instances: &mut RenderMeshInstances, + batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers< + MeshUniform, + MeshInputUniform, + >, + render_mesh_instance_queues: &mut Parallel>, + prev_render_mesh_instances: &mut RenderMeshInstancesGpu, +) { + // Collect render mesh instances. Build up the uniform buffer. + let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`collect_render_mesh_instances_for_gpu_building` should only be called if we're \ + using GPU `MeshUniform` building" + ); + }; + + let gpu_preprocessing::BatchedInstanceBuffers { + ref mut current_input_buffer, + ref mut previous_input_buffer, + .. + } = batched_instance_buffers; + + // Swap buffers. + mem::swap(current_input_buffer, previous_input_buffer); + mem::swap(render_mesh_instances, prev_render_mesh_instances); + + // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s. + render_mesh_instances.clear(); + for queue in render_mesh_instance_queues.iter_mut() { + for (entity, builder) in queue.drain(..) { + let previous_input_index = if builder + .shared + .flags + .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM) + { + prev_render_mesh_instances + .get(&entity) + .map(|render_mesh_instance| render_mesh_instance.current_uniform_index) + } else { + None + }; + + // Push the mesh input uniform. + let current_uniform_index = current_input_buffer.push(MeshInputUniform { + transform: builder.transform.to_transpose(), + lightmap_uv_rect: builder.lightmap_uv_rect, + flags: builder.mesh_flags.bits(), + previous_input_index: match previous_input_index { + Some(previous_input_index) => previous_input_index.into(), + None => u32::MAX, + }, + }) as u32; + + // Record the [`RenderMeshInstance`]. 
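            // `current_uniform_index` is stored as a `NonMaxU32`, the
            // complement of the `u32::MAX` sentinel written just above: a
            // valid index can never be `u32::MAX`, so that niche is free for
            // `Option` packing on the Rust side.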
+ render_mesh_instances.insert( + entity, + RenderMeshInstanceGpu { + translation: builder.transform.translation, + shared: builder.shared, + current_uniform_index: NonMaxU32::try_from(current_uniform_index) + .unwrap_or_default(), + }, + ); + } + } +} + #[derive(Resource, Clone)] pub struct MeshPipeline { view_layouts: [MeshPipelineViewLayout; MeshPipelineViewLayoutKey::COUNT], @@ -467,6 +928,13 @@ impl GetBatchData for MeshPipeline { (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { + let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_batch_data` should never be called in GPU mesh uniform \ + building mode" + ); + return None; + }; let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); @@ -484,15 +952,45 @@ impl GetBatchData for MeshPipeline { } } -impl GetBinnedBatchData for MeshPipeline { - type Param = (SRes, SRes); +impl GetFullBatchData for MeshPipeline { + type BufferInputData = MeshInputUniform; - type BufferData = MeshUniform; + fn get_index_and_compare_data( + (mesh_instances, lightmaps): &SystemParamItem, + entity: Entity, + ) -> Option<(NonMaxU32, Option)> { + // This should only be called during GPU building. + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_index_and_compare_data` should never be called in CPU mesh uniform building \ + mode" + ); + return None; + }; - fn get_batch_data( + let mesh_instance = mesh_instances.get(&entity)?; + let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); + + Some(( + mesh_instance.current_uniform_index, + mesh_instance.should_batch().then_some(( + mesh_instance.material_bind_group_id.get(), + mesh_instance.mesh_asset_id, + maybe_lightmap.map(|lightmap| lightmap.image), + )), + )) + } + + fn get_binned_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option { + let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_binned_batch_data` should never be called in GPU mesh uniform building mode" + ); + return None; + }; let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); @@ -501,6 +999,24 @@ impl GetBinnedBatchData for MeshPipeline { maybe_lightmap.map(|lightmap| lightmap.uv_rect), )) } + + fn get_binned_index( + (mesh_instances, _): &SystemParamItem, + entity: Entity, + ) -> Option { + // This should only be called during GPU building. + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_binned_index` should never be called in CPU mesh uniform \ + building mode" + ); + return None; + }; + + mesh_instances + .get(&entity) + .map(|entity| entity.current_uniform_index) + } } bitflags::bitflags! 
{
@@ -1031,16 +1547,32 @@ pub fn prepare_mesh_bind_group(
     mut groups: ResMut<MeshBindGroups>,
     mesh_pipeline: Res<MeshPipeline>,
     render_device: Res<RenderDevice>,
-    mesh_uniforms: Res<GpuArrayBuffer<MeshUniform>>,
+    cpu_batched_instance_buffer: Option<
+        Res<no_gpu_preprocessing::BatchedInstanceBuffer<MeshUniform>>,
+    >,
+    gpu_batched_instance_buffers: Option<
+        Res<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
+    >,
     skins_uniform: Res<SkinUniform>,
     weights_uniform: Res<MorphUniform>,
     render_lightmaps: Res<RenderLightmaps>,
 ) {
     groups.reset();
     let layouts = &mesh_pipeline.mesh_layouts;
-    let Some(model) = mesh_uniforms.binding() else {
+
+    let model = if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+        cpu_batched_instance_buffer
+            .into_inner()
+            .instance_data_binding()
+    } else if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        gpu_batched_instance_buffers
+            .into_inner()
+            .instance_data_binding()
+    } else {
         return;
     };
+    let Some(model) = model else { return };
+
     groups.model_only = Some(layouts.model_only(&render_device, &model));
 
     let skin = skins_uniform.buffer.buffer();
@@ -1140,7 +1672,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
         let entity = &item.entity();
 
-        let Some(mesh) = mesh_instances.get(entity) else {
+        let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(*entity) else {
             return RenderCommandResult::Success;
         };
         let skin_index = skin_indices.get(entity);
@@ -1154,8 +1686,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
             .get(entity)
             .map(|render_lightmap| render_lightmap.image);
 
-        let Some(bind_group) =
-            bind_groups.get(mesh.mesh_asset_id, lightmap, is_skinned, is_morphed)
+        let Some(bind_group) = bind_groups.get(mesh_asset_id, lightmap, is_skinned, is_morphed)
         else {
             error!(
                 "The MeshBindGroups resource wasn't set in the render phase. \
@@ -1187,24 +1718,50 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
 
 pub struct DrawMesh;
 impl<P: PhaseItem> RenderCommand<P>
for DrawMesh {
-    type Param = (SRes<RenderAssets<GpuMesh>>, SRes<RenderMeshInstances>);
-    type ViewQuery = ();
+    type Param = (
+        SRes<RenderAssets<GpuMesh>>,
+        SRes<RenderMeshInstances>,
+        SRes<PipelineCache>,
+        Option<SRes<PreprocessPipeline>>,
+    );
+    type ViewQuery = Has<PreprocessBindGroup>;
     type ItemQuery = ();
     #[inline]
     fn render<'w>(
         item: &P,
-        _view: (),
+        has_preprocess_bind_group: ROQueryItem<'w, Self::ViewQuery>,
         _item_query: Option<()>,
-        (meshes, mesh_instances): SystemParamItem<'w, '_, Self::Param>,
+        (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem<
+            'w,
+            '_,
+            Self::Param,
+        >,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
+        // If we're using GPU preprocessing, then we're dependent on that
+        // compute shader having been run, which of course can only happen if
+        // it's compiled. Otherwise, our mesh instance data won't be present.
+        if let Some(preprocess_pipeline) = preprocess_pipeline {
+            if !has_preprocess_bind_group
+                || !preprocess_pipeline
+                    .pipeline_id
+                    .is_some_and(|preprocess_pipeline_id| {
+                        pipeline_cache
+                            .get_compute_pipeline(preprocess_pipeline_id)
+                            .is_some()
+                    })
+            {
+                return RenderCommandResult::Failure;
+            }
+        }
+
         let meshes = meshes.into_inner();
         let mesh_instances = mesh_instances.into_inner();
 
-        let Some(mesh_instance) = mesh_instances.get(&item.entity()) else {
+        let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
             return RenderCommandResult::Failure;
         };
-        let Some(gpu_mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
+        let Some(gpu_mesh) = meshes.get(mesh_asset_id) else {
             return RenderCommandResult::Failure;
         };
diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
new file mode 100644
index 0000000000000..c4adaa5105623
--- /dev/null
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -0,0 +1,86 @@
+// GPU mesh uniform building.
+//
+// This is a compute shader that expands each `MeshInputUniform` out to a full
+// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform`
+// and `MeshUniform` are in a 1:N relationship.) It runs in parallel for all
+// meshes for all views. As part of this process, the shader gathers each
+// mesh's transform on the previous frame and writes it into the `MeshUniform`
+// so that TAA works.
+
+#import bevy_pbr::mesh_types::Mesh
+#import bevy_render::maths
+
+// Per-frame data that the CPU supplies to the GPU.
+struct MeshInput {
+    // The model transform.
+    model: mat3x4<f32>,
+    // The lightmap UV rect, packed into 64 bits.
+    lightmap_uv_rect: vec2<u32>,
+    // Various flags.
+    flags: u32,
+    // The index of this mesh's `MeshInput` in the `previous_input` array, if
+    // applicable. If not present, this is `u32::MAX`.
+    previous_input_index: u32,
+}
+
+// One invocation of this compute shader: i.e. one mesh instance in a view.
+struct PreprocessWorkItem {
+    // The index of the `MeshInput` in the `current_input` buffer that we read
+    // from.
+    input_index: u32,
+    // The index of the `Mesh` in `output` that we write to.
+    output_index: u32,
+}
+
+// The current frame's `MeshInput`.
+@group(0) @binding(0) var<storage> current_input: array<MeshInput>;
+// The `MeshInput` values from the previous frame.
+@group(0) @binding(1) var<storage> previous_input: array<MeshInput>;
+// Indices into the `MeshInput` buffer.
+//
+// There may be many indices that map to the same `MeshInput`.
+@group(0) @binding(2) var<storage> work_items: array<PreprocessWorkItem>;
+// The output array of `Mesh`es.
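// This is the same buffer that `prepare_mesh_bind_group` later binds as the
// per-instance mesh data for the render phases; this shader is its only
// writer.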
+@group(0) @binding(3) var<storage, read_write> output: array<Mesh>;
+
+@compute
+@workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
+    let instance_index = global_invocation_id.x;
+    if (instance_index >= arrayLength(&work_items)) {
+        return;
+    }
+
+    // Unpack.
+    let mesh_index = work_items[instance_index].input_index;
+    let output_index = work_items[instance_index].output_index;
+    let model_affine_transpose = current_input[mesh_index].model;
+    let model = maths::affine3_to_square(model_affine_transpose);
+
+    // Calculate inverse transpose.
+    let inverse_transpose_model = transpose(maths::inverse_affine3(transpose(
+        model_affine_transpose)));
+
+    // Pack inverse transpose.
+    let inverse_transpose_model_a = mat2x4<f32>(
+        vec4<f32>(inverse_transpose_model[0].xyz, inverse_transpose_model[1].x),
+        vec4<f32>(inverse_transpose_model[1].yz, inverse_transpose_model[2].xy));
+    let inverse_transpose_model_b = inverse_transpose_model[2].z;
+
+    // Look up the previous model matrix.
+    let previous_input_index = current_input[mesh_index].previous_input_index;
+    var previous_model: mat3x4<f32>;
+    if (previous_input_index == 0xffffffff) {
+        previous_model = model_affine_transpose;
+    } else {
+        previous_model = previous_input[previous_input_index].model;
+    }
+
+    // Write the output.
+    output[output_index].model = model_affine_transpose;
+    output[output_index].previous_model = previous_model;
+    output[output_index].inverse_transpose_model_a = inverse_transpose_model_a;
+    output[output_index].inverse_transpose_model_b = inverse_transpose_model_b;
+    output[output_index].flags = current_input[mesh_index].flags;
+    output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect;
+}
diff --git a/crates/bevy_pbr/src/render/mod.rs b/crates/bevy_pbr/src/render/mod.rs
index 7efffc05681f8..53bc9bcde14b4 100644
--- a/crates/bevy_pbr/src/render/mod.rs
+++ b/crates/bevy_pbr/src/render/mod.rs
@@ -1,4 +1,5 @@
 mod fog;
+mod gpu_preprocess;
 mod light;
 pub(crate) mod mesh;
 mod mesh_bindings;
@@ -7,6 +8,7 @@ mod morph;
 mod skin;
 
 pub use fog::*;
+pub use gpu_preprocess::*;
 pub use light::*;
 pub use mesh::*;
 pub use mesh_bindings::MeshLayouts;
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
new file mode 100644
index 0000000000000..8757943419eb6
--- /dev/null
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -0,0 +1,305 @@
+//! Batching functionality when GPU preprocessing is in use.
+
+use std::marker::PhantomData;
+
+use bevy_ecs::{
+    entity::Entity,
+    query::With,
+    system::{Query, Res, ResMut, Resource, StaticSystemParam},
+};
+use bevy_encase_derive::ShaderType;
+use bevy_utils::EntityHashMap;
+use bytemuck::{Pod, Zeroable};
+use smallvec::smallvec;
+use wgpu::{BindingResource, BufferUsages};
+
+use crate::{
+    render_phase::{
+        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
+        SortedPhaseItem, SortedRenderPhase,
+    },
+    render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec},
+    renderer::{RenderDevice, RenderQueue},
+    view::ViewTarget,
+};
+
+use super::GetFullBatchData;
+
+/// The GPU buffers holding the data needed to render batches.
+///
+/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
+/// `BD` type parameter in that mode.
+///
+/// We have a separate *buffer data input* type (`BDI`) here, which a compute
+/// shader is expected to expand to the full buffer data (`BD`) type.
GPU +/// uniform building is generally faster and uses less system RAM to VRAM bus +/// bandwidth, but only implemented for some pipelines (for example, not in the +/// 2D pipeline at present) and only when compute shader is available. +#[derive(Resource)] +pub struct BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// A storage area for the buffer data that the GPU compute shader is + /// expected to write to. + /// + /// There will be one entry for each index. + pub data_buffer: UninitBufferVec, + + /// The index of the buffer data in the current input buffer that + /// corresponds to each instance. + /// + /// This is keyed off each view. Each view has a separate buffer. + pub work_item_buffers: EntityHashMap>, + + /// The uniform data inputs for the current frame. + /// + /// These are uploaded during the extraction phase. + pub current_input_buffer: BufferVec, + + /// The uniform data inputs for the previous frame. + /// + /// The indices don't generally line up between `current_input_buffer` + /// and `previous_input_buffer`, because, among other reasons, entities + /// can spawn or despawn between frames. Instead, each current buffer + /// data input uniform is expected to contain the index of the + /// corresponding buffer data input uniform in this list. + pub previous_input_buffer: BufferVec, +} + +/// One invocation of the preprocessing shader: i.e. one mesh instance in a +/// view. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct PreprocessWorkItem { + /// The index of the batch input data in the input buffer that the shader + /// reads from. + pub input_index: u32, + /// The index of the `MeshUniform` in the output buffer that we write to. + pub output_index: u32, +} + +impl BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// Creates new buffers. + pub fn new() -> Self { + BatchedInstanceBuffers { + data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), + work_item_buffers: EntityHashMap::default(), + current_input_buffer: BufferVec::new(BufferUsages::STORAGE), + previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), + } + } + + /// Returns the binding of the buffer that contains the per-instance data. + /// + /// This buffer needs to be filled in via a compute shader. + pub fn instance_data_binding(&self) -> Option { + self.data_buffer + .buffer() + .map(|buffer| buffer.as_entire_binding()) + } + + /// Clears out the buffers in preparation for a new frame. + pub fn clear(&mut self) { + self.data_buffer.clear(); + self.current_input_buffer.clear(); + self.previous_input_buffer.clear(); + for work_item_buffer in self.work_item_buffers.values_mut() { + work_item_buffer.clear(); + } + } +} + +impl Default for BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + fn default() -> Self { + Self::new() + } +} + +/// A system that removes GPU preprocessing work item buffers that correspond to +/// deleted [`ViewTarget`]s. +/// +/// This is a separate system from [`super::clear_batched_instance_buffers`] +/// because [`ViewTarget`]s aren't created until after the extraction phase is +/// completed. 
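// The work item buffers are created lazily, one per view, by the two
// `batch_and_prepare_*` systems below; without this `retain` pass they would
// accumulate indefinitely as cameras spawn and despawn.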
+pub fn delete_old_work_item_buffers( + mut gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + view_targets: Query>, +) where + GFBD: GetFullBatchData, +{ + gpu_batched_instance_buffers + .work_item_buffers + .retain(|entity, _| view_targets.contains(*entity)); +} + +/// Batch the items in a sorted render phase, when GPU instance buffer building +/// is in use. This means comparing metadata needed to draw each phase item and +/// trying to combine the draws into a batch. +pub fn batch_and_prepare_sorted_render_phase( + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + mut views: Query<(Entity, &mut SortedRenderPhase)>, + param: StaticSystemParam, +) where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + // Create the work item buffer if necessary. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { + let (input_index, compare_data) = + GFBD::get_index_and_compare_data(&system_param_item, item.entity())?; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + + *item.batch_range_mut() = output_index..output_index + 1; + + compare_data + }); + } +} + +/// Creates batches for a render phase that uses bins. +pub fn batch_and_prepare_binned_render_phase( + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + mut views: Query<(Entity, &mut BinnedRenderPhase)>, + param: StaticSystemParam, +) where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + let BatchedInstanceBuffers { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + let phase = &mut *phase; // Borrow checker. + + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + // Prepare batchables. + + for key in &phase.batchable_keys { + let mut batch: Option = None; + for &entity in &phase.batchable_values[key] { + let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + + batch + .get_or_insert(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index, + dynamic_offset: None, + }) + .instance_range + .end = output_index + 1; + } + + if let Some(batch) = batch { + phase.batch_sets.push(smallvec![batch]); + } + } + + // Prepare unbatchables. 
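        // Unbatchable entities still get a `MeshUniform` slot and a work item
        // apiece; they are simply recorded as single-instance draws through
        // `GpuArrayBufferIndex` instead of being folded into a batch range.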
+        for key in &phase.unbatchable_keys {
+            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
+            for &entity in &unbatchables.entities {
+                let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
+                    continue;
+                };
+                let output_index = data_buffer.add() as u32;
+
+                work_item_buffer.push(PreprocessWorkItem {
+                    input_index: input_index.into(),
+                    output_index,
+                });
+
+                unbatchables
+                    .buffer_indices
+                    .add(GpuArrayBufferIndex::<GFBD::BufferData> {
+                        index: output_index,
+                        dynamic_offset: None,
+                        element_type: PhantomData,
+                    });
+            }
+        }
+    }
+}
+
+/// A system that writes all instance buffers to the GPU.
+pub fn write_batched_instance_buffers<GFBD>(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut gpu_batched_instance_buffers: ResMut<
+        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+    >,
+) where
+    GFBD: GetFullBatchData,
+{
+    gpu_batched_instance_buffers
+        .data_buffer
+        .write_buffer(&render_device);
+    gpu_batched_instance_buffers
+        .current_input_buffer
+        .write_buffer(&render_device, &render_queue);
+    // There's no need to write `previous_input_buffer`, as we wrote
+    // that on the previous frame, and it hasn't changed.
+
+    for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() {
+        work_item_buffer.write_buffer(&render_device, &render_queue);
+    }
+}
+
+/// Determines whether it's possible to run preprocessing on the GPU.
+///
+/// Currently, this simply checks to see whether compute shaders are supported.
+pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool {
+    render_device.limits().max_compute_workgroup_size_x > 0
+}
diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs
index 576bf641fb71f..6811451f37dbd 100644
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@@ -1,21 +1,22 @@
 use bevy_ecs::{
     component::Component,
     entity::Entity,
-    prelude::Res,
-    system::{Query, ResMut, StaticSystemParam, SystemParam, SystemParamItem},
+    system::{Query, ResMut, SystemParam, SystemParamItem},
 };
+use bytemuck::Pod;
 use nonmax::NonMaxU32;
-use smallvec::{smallvec, SmallVec};
 
 use crate::{
     render_phase::{
-        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        DrawFunctionId, SortedPhaseItem, SortedRenderPhase,
+        BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem, DrawFunctionId,
+        SortedPhaseItem, SortedRenderPhase,
     },
-    render_resource::{CachedRenderPipelineId, GpuArrayBuffer, GpuArrayBufferable},
-    renderer::{RenderDevice, RenderQueue},
+    render_resource::{CachedRenderPipelineId, GpuArrayBufferable},
 };
+pub mod gpu_preprocessing;
+pub mod no_gpu_preprocessing;
+
 /// Add this component to mesh entities to disable automatic batching
 #[derive(Component)]
 pub struct NoAutomaticBatching;
@@ -59,6 +60,10 @@ impl<T: PartialEq> BatchMeta<T> {
 /// A trait to support getting data used for batching draw commands via phase
 /// items.
+///
+/// This is a simple version that only allows for sorting, not binning, as well
+/// as only CPU processing, not GPU preprocessing. For these fancier features,
+/// see [`GetFullBatchData`].
 pub trait GetBatchData {
     /// The system parameters [`GetBatchData::get_batch_data`] needs in
     /// order to compute the batch data.
@@ -67,79 +72,90 @@ pub trait GetBatchData {
     /// function id, per-instance data buffer dynamic offset and this data
     /// matches, the draws can be batched.
     type CompareData: PartialEq;
-    /// The per-instance data to be inserted into the [`GpuArrayBuffer`]
-    /// containing these data for all instances.
+    /// The per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`] containing these data for all
+    /// instances.
     type BufferData: GpuArrayBufferable + Sync + Send + 'static;
-    /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`].
-    /// If the instance can be batched, also return the data used for
-    /// comparison when deciding whether draws can be batched, else return None
-    /// for the `CompareData`.
+    /// Get the per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`]. If the instance can be
+    /// batched, also return the data used for comparison when deciding whether
+    /// draws can be batched, else return None for the `CompareData`.
+    ///
+    /// This is only called when building instance data on CPU. In the GPU
+    /// instance data building path, we use
+    /// [`GetFullBatchData::get_index_and_compare_data`] instead.
     fn get_batch_data(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)>;
 }
 
-/// When implemented on a pipeline, this trait allows the batching logic to
-/// compute the per-batch data that will be uploaded to the GPU.
+/// A trait to support getting data used for batching draw commands via phase
+/// items.
 ///
-/// This includes things like the mesh transforms.
-pub trait GetBinnedBatchData {
-    /// The system parameters [`GetBinnedBatchData::get_batch_data`] needs
-    /// in order to compute the batch data.
-    type Param: SystemParam + 'static;
-    /// The per-instance data to be inserted into the [`GpuArrayBuffer`]
-    /// containing these data for all instances.
-    type BufferData: GpuArrayBufferable + Sync + Send + 'static;
-
-    /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`].
-    fn get_batch_data(
+/// This version allows for binning and GPU preprocessing.
+pub trait GetFullBatchData: GetBatchData {
+    /// The per-instance data that was inserted into the
+    /// [`crate::render_resource::BufferVec`] during extraction.
+    type BufferInputData: Pod + Sync + Send;
+
+    /// Get the per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`].
+    ///
+    /// This is only called when building uniforms on CPU. In the GPU instance
+    /// buffer building path, we use [`GetFullBatchData::get_binned_index`]
+    /// instead.
+    fn get_binned_batch_data(
         param: &SystemParamItem<Self::Param>,
-        entity: Entity,
+        query_item: Entity,
     ) -> Option<Self::BufferData>;
+
+    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
+    /// GPU preprocessing phase will use.
+    ///
+    /// We already inserted the [`GetFullBatchData::BufferInputData`] during the
+    /// extraction phase before we got here, so this function shouldn't need to
+    /// look up any render data. If CPU instance buffer building is in use, this
+    /// function will never be called.
+    fn get_index_and_compare_data(
+        param: &SystemParamItem<Self::Param>,
+        query_item: Entity,
+    ) -> Option<(NonMaxU32, Option<Self::CompareData>)>;
+
+    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
+    /// GPU preprocessing phase will use, for the binning path.
+    ///
+    /// We already inserted the [`GetFullBatchData::BufferInputData`] during the
+    /// extraction phase before we got here, so this function shouldn't need to
+    /// look up any render data. If CPU instance buffer building is in use, this
+    /// function will never be called.
+    fn get_binned_index(
+        param: &SystemParamItem<Self::Param>,
+        query_item: Entity,
+    ) -> Option<NonMaxU32>;
 }
 
-/// Batch the items in a sorted render phase. This means comparing metadata
-/// needed to draw each phase item and trying to combine the draws into a batch.
-pub fn batch_and_prepare_sorted_render_phase<I, F>(
-    gpu_array_buffer: ResMut<GpuArrayBuffer<F::BufferData>>,
-    mut views: Query<&mut SortedRenderPhase<I>>,
-    param: StaticSystemParam<F::Param>,
+/// A system that runs early in extraction and clears out all the
+/// [`gpu_preprocessing::BatchedInstanceBuffers`] for the frame.
+///
+/// We have to run this during extraction because, if GPU preprocessing is in
+/// use, the extraction phase will write to the mesh input uniform buffers
+/// directly, so the buffers need to be cleared before then.
+pub fn clear_batched_instance_buffers<GFBD>(
+    cpu_batched_instance_buffer: Option<
+        ResMut<no_gpu_preprocessing::BatchedInstanceBuffer<GFBD::BufferData>>,
+    >,
+    gpu_batched_instance_buffers: Option<
+        ResMut<gpu_preprocessing::BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    >,
 ) where
-    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
-    F: GetBatchData,
+    GFBD: GetFullBatchData,
 {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    let system_param_item = param.into_inner();
-
-    let mut process_item = |item: &mut I| {
-        let (buffer_data, compare_data) = F::get_batch_data(&system_param_item, item.entity())?;
-        let buffer_index = gpu_array_buffer.push(buffer_data);
-
-        let index = buffer_index.index;
-        *item.batch_range_mut() = index..index + 1;
-        *item.dynamic_offset_mut() = buffer_index.dynamic_offset;
-
-        if I::AUTOMATIC_BATCHING {
-            compare_data.map(|compare_data| BatchMeta::new(item, compare_data))
-        } else {
-            None
-        }
-    };
-
-    for mut phase in &mut views {
-        let items = phase.items.iter_mut().map(|item| {
-            let batch_data = process_item(item);
-            (item.batch_range_mut(), batch_data)
-        });
-        items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| {
-            if batch_meta.is_some() && prev_batch_meta == batch_meta {
-                start_range.end = range.end;
-                (start_range, prev_batch_meta)
-            } else {
-                (range, batch_meta)
-            }
-        });
+    if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+        cpu_batched_instance_buffer.clear();
+    }
+    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        gpu_batched_instance_buffers.clear();
     }
 }
 
@@ -154,75 +170,35 @@ where
     }
 }
 
-/// Creates batches for a render phase that uses bins.
-pub fn batch_and_prepare_binned_render_phase<BPI, GBBD>(
-    gpu_array_buffer: ResMut<GpuArrayBuffer<GBBD::BufferData>>,
-    mut views: Query<&mut BinnedRenderPhase<BPI>>,
-    param: StaticSystemParam<GBBD::Param>,
+/// Batches the items in a sorted render phase.
+///
+/// This means comparing metadata needed to draw each phase item and trying to
+/// combine the draws into a batch.
+///
+/// This is common code factored out from
+/// [`gpu_preprocessing::batch_and_prepare_sorted_render_phase`] and
+/// [`no_gpu_preprocessing::batch_and_prepare_sorted_render_phase`].
+fn batch_and_prepare_sorted_render_phase<I, GBD>(
+    phase: &mut SortedRenderPhase<I>,
+    mut process_item: impl FnMut(&mut I) -> Option<GBD::CompareData>,
 ) where
-    BPI: BinnedPhaseItem,
-    GBBD: GetBinnedBatchData,
+    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    GBD: GetBatchData,
 {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    let system_param_item = param.into_inner();
-
-    for mut phase in &mut views {
-        let phase = &mut *phase; // Borrow checker.
-
-        // Prepare batchables.
-
-        for key in &phase.batchable_keys {
-            let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
-            for &entity in &phase.batchable_values[key] {
-                let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else {
-                    continue;
-                };
-
-                let instance = gpu_array_buffer.push(buffer_data);
-
-                // If the dynamic offset has changed, flush the batch.
-                //
-                // This is the only time we ever have more than one batch per
-                // bin. Note that dynamic offsets are only used on platforms
-                // with no storage buffers.
-                if !batch_set.last().is_some_and(|batch| {
-                    batch.instance_range.end == instance.index
-                        && batch.dynamic_offset == instance.dynamic_offset
-                }) {
-                    batch_set.push(BinnedRenderPhaseBatch {
-                        representative_entity: entity,
-                        instance_range: instance.index..instance.index,
-                        dynamic_offset: instance.dynamic_offset,
-                    });
-                }
-
-                if let Some(batch) = batch_set.last_mut() {
-                    batch.instance_range.end = instance.index + 1;
-                }
-            }
-
-            phase.batch_sets.push(batch_set);
-        }
-
-        // Prepare unbatchables.
-        for key in &phase.unbatchable_keys {
-            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
-            for &entity in &unbatchables.entities {
-                if let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) {
-                    let instance = gpu_array_buffer.push(buffer_data);
-                    unbatchables.buffer_indices.add(instance);
-                }
-            }
+    let items = phase.items.iter_mut().map(|item| {
+        let batch_data = match process_item(item) {
+            Some(compare_data) if I::AUTOMATIC_BATCHING => Some(BatchMeta::new(item, compare_data)),
+            _ => None,
+        };
+        (item.batch_range_mut(), batch_data)
+    });
+
+    items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| {
+        if batch_meta.is_some() && prev_batch_meta == batch_meta {
+            start_range.end = range.end;
+            (start_range, prev_batch_meta)
+        } else {
+            (range, batch_meta)
         }
-    }
-}
-
-pub fn write_batched_instance_buffer<F: GetBatchData>(
-    render_device: Res<RenderDevice>,
-    render_queue: Res<RenderQueue>,
-    gpu_array_buffer: ResMut<GpuArrayBuffer<F::BufferData>>,
-) {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    gpu_array_buffer.write_buffer(&render_device, &render_queue);
-    gpu_array_buffer.clear();
+    });
 }
diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
new file mode 100644
index 0000000000000..429fe5bb4542e
--- /dev/null
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@@ -0,0 +1,151 @@
+//! Batching functionality when GPU preprocessing isn't in use.
+
+use bevy_derive::{Deref, DerefMut};
+use bevy_ecs::system::{Query, Res, ResMut, Resource, StaticSystemParam};
+use smallvec::{smallvec, SmallVec};
+use wgpu::BindingResource;
+
+use crate::{
+    render_phase::{
+        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
+        SortedPhaseItem, SortedRenderPhase,
+    },
+    render_resource::{GpuArrayBuffer, GpuArrayBufferable},
+    renderer::{RenderDevice, RenderQueue},
+};
+
+use super::{GetBatchData, GetFullBatchData};
+
+/// The GPU buffers holding the data needed to render batches.
+///
+/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
+/// `BD` type parameter in that mode.
+#[derive(Resource, Deref, DerefMut)]
+pub struct BatchedInstanceBuffer<BD>(pub GpuArrayBuffer<BD>)
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static;
+
+impl<BD> BatchedInstanceBuffer<BD>
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    /// Creates a new buffer.
+    pub fn new(render_device: &RenderDevice) -> Self {
+        BatchedInstanceBuffer(GpuArrayBuffer::new(render_device))
+    }
+
+    /// Returns the binding of the buffer that contains the per-instance data.
+    ///
+    /// Unlike in the GPU instance buffer building mode, this buffer is fully
+    /// populated on the CPU, so no compute shader pass is needed before it can
+    /// be bound.
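+    ///
+    /// A sketch of typical usage, mirroring how the 2D mesh pipeline prepares
+    /// its bind group later in this patch (the resource and layout names here
+    /// are illustrative):
+    ///
+    /// ```ignore
+    /// if let Some(binding) = instance_buffer.instance_data_binding() {
+    ///     let bind_group = render_device.create_bind_group(
+    ///         "mesh_bind_group",
+    ///         &pipeline.mesh_layout,
+    ///         &BindGroupEntries::single(binding),
+    ///     );
+    /// }
+    /// ```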
+    pub fn instance_data_binding(&self) -> Option<BindingResource> {
+        self.binding()
+    }
+}
+
+/// Batch the items in a sorted render phase, when GPU instance buffer building
+/// isn't in use. This means comparing metadata needed to draw each phase item
+/// and trying to combine the draws into a batch.
+pub fn batch_and_prepare_sorted_render_phase<I, GBD>(
+    batched_instance_buffer: ResMut<BatchedInstanceBuffer<GBD::BufferData>>,
+    mut views: Query<&mut SortedRenderPhase<I>>,
+    param: StaticSystemParam<GBD::Param>,
+) where
+    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    GBD: GetBatchData,
+{
+    let system_param_item = param.into_inner();
+
+    // We only process CPU-built batch data in this function.
+    let batched_instance_buffer = batched_instance_buffer.into_inner();
+
+    for mut phase in &mut views {
+        super::batch_and_prepare_sorted_render_phase::<I, GBD>(&mut phase, |item| {
+            let (buffer_data, compare_data) =
+                GBD::get_batch_data(&system_param_item, item.entity())?;
+            let buffer_index = batched_instance_buffer.push(buffer_data);
+
+            let index = buffer_index.index;
+            *item.batch_range_mut() = index..index + 1;
+            *item.dynamic_offset_mut() = buffer_index.dynamic_offset;
+
+            compare_data
+        });
+    }
+}
+
+/// Creates batches for a render phase that uses bins, when GPU batch data
+/// building isn't in use.
+pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
+    mut buffer: ResMut<BatchedInstanceBuffer<GFBD::BufferData>>,
+    mut views: Query<&mut BinnedRenderPhase<BPI>>,
+    param: StaticSystemParam<GFBD::Param>,
+) where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    let system_param_item = param.into_inner();
+
+    for mut phase in &mut views {
+        let phase = &mut *phase; // Borrow checker.
+
+        // Prepare batchables.
+
+        for key in &phase.batchable_keys {
+            let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
+            for &entity in &phase.batchable_values[key] {
+                let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity)
+                else {
+                    continue;
+                };
+                let instance = buffer.push(buffer_data);
+
+                // If the dynamic offset has changed, flush the batch.
+                //
+                // This is the only time we ever have more than one batch per
+                // bin. Note that dynamic offsets are only used on platforms
+                // with no storage buffers.
+                if !batch_set.last().is_some_and(|batch| {
+                    batch.instance_range.end == instance.index
+                        && batch.dynamic_offset == instance.dynamic_offset
+                }) {
+                    batch_set.push(BinnedRenderPhaseBatch {
+                        representative_entity: entity,
+                        instance_range: instance.index..instance.index,
+                        dynamic_offset: instance.dynamic_offset,
+                    });
+                }
+
+                if let Some(batch) = batch_set.last_mut() {
+                    batch.instance_range.end = instance.index + 1;
+                }
+            }
+
+            phase.batch_sets.push(batch_set);
+        }
+
+        // Prepare unbatchables.
+        for key in &phase.unbatchable_keys {
+            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
+            for &entity in &unbatchables.entities {
+                let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity)
+                else {
+                    continue;
+                };
+                let instance = buffer.push(buffer_data);
+                unbatchables.buffer_indices.add(instance);
+            }
+        }
+    }
+}
+
+/// Writes the instance buffer data to the GPU.
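+///
+/// A sketch of how this system is typically scheduled, once the batching
+/// systems above have pushed all the instance data for the frame (the pipeline
+/// type here is an illustrative assumption):
+///
+/// ```ignore
+/// render_app.add_systems(
+///     Render,
+///     write_batched_instance_buffer::<Mesh2dPipeline>
+///         .in_set(RenderSet::PrepareResourcesFlush),
+/// );
+/// ```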
+pub fn write_batched_instance_buffer<GBD>(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut cpu_batched_instance_buffer: ResMut<BatchedInstanceBuffer<GBD::BufferData>>,
+) where
+    GBD: GetBatchData,
+{
+    cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue);
+}
diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl
index 17d045154a317..53c254f3746f8 100644
--- a/crates/bevy_render/src/maths.wgsl
+++ b/crates/bevy_render/src/maths.wgsl
@@ -27,3 +27,27 @@ fn mat2x4_f32_to_mat3x3_unpack(
         vec3<f32>(a[1].zw, b),
     );
 }
+
+// Extracts the square portion of an affine matrix: i.e. discards the
+// translation.
+fn affine3_to_mat3x3(affine: mat4x3<f32>) -> mat3x3<f32> {
+    return mat3x3<f32>(affine[0].xyz, affine[1].xyz, affine[2].xyz);
+}
+
+// Returns the inverse of a 3x3 matrix, computed from the cross products of
+// its columns (the transposed adjugate divided by the determinant).
+fn inverse_mat3x3(matrix: mat3x3<f32>) -> mat3x3<f32> {
+    let tmp0 = cross(matrix[1], matrix[2]);
+    let tmp1 = cross(matrix[2], matrix[0]);
+    let tmp2 = cross(matrix[0], matrix[1]);
+    let inv_det = 1.0 / dot(matrix[2], tmp2);
+    return transpose(mat3x3<f32>(tmp0 * inv_det, tmp1 * inv_det, tmp2 * inv_det));
+}
+
+// Returns the inverse of an affine matrix [M|t]: the inverse of its 3x3 part
+// M, with translation -(M^-1) * t.
+//
+// https://en.wikipedia.org/wiki/Affine_transformation#Groups
+fn inverse_affine3(affine: mat4x3<f32>) -> mat4x3<f32> {
+    let matrix3 = affine3_to_mat3x3(affine);
+    let inv_matrix3 = inverse_mat3x3(matrix3);
+    return mat4x3<f32>(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3]));
+}
diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs
index 40c4153f3fde2..58e1fa550f4e2 100644
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@@ -10,11 +10,10 @@
 //!
 //! To draw an entity, a corresponding [`PhaseItem`] has to be added to one or multiple of these
 //! render phases for each view that it is visible in.
-//! This must be done in the [`RenderSet::Queue`](crate::RenderSet::Queue).
-//! After that the render phase sorts them in the
-//! [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort).
-//! Finally the items are rendered using a single [`TrackedRenderPass`], during the
-//! [`RenderSet::Render`](crate::RenderSet::Render).
+//! This must be done in the [`RenderSet::Queue`].
+//! After that the render phase sorts them in the [`RenderSet::PhaseSort`].
+//! Finally the items are rendered using a single [`TrackedRenderPass`], during
+//! the [`RenderSet::Render`].
 //!
 //! Therefore each phase item is assigned a [`Draw`] function.
 //! These set up the state of the [`TrackedRenderPass`] (i.e. select the
@@ -29,6 +28,7 @@ mod draw;
 mod draw_state;
 mod rangefinder;
 
+use bevy_app::{App, Plugin};
 use bevy_utils::{default, hashbrown::hash_map::Entry, HashMap};
 pub use draw::*;
 pub use draw_state::*;
@@ -36,13 +36,22 @@ use encase::{internal::WriteInto, ShaderSize};
 use nonmax::NonMaxU32;
 pub use rangefinder::*;
 
-use crate::render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache};
+use crate::{
+    batching::{
+        self,
+        gpu_preprocessing::{self, BatchedInstanceBuffers},
+        no_gpu_preprocessing::{self, BatchedInstanceBuffer},
+        GetFullBatchData,
+    },
+    render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache},
+    Render, RenderApp, RenderSet,
+};
 use bevy_ecs::{
     prelude::*,
     system::{lifetimeless::SRes, SystemParamItem},
 };
 use smallvec::SmallVec;
-use std::{hash::Hash, ops::Range, slice::SliceIndex};
+use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex};
 
 /// A collection of all rendering instructions, that will be executed by the GPU, for a
 /// single render phase for a single view.
@@ -291,6 +300,101 @@ where
     }
 }
 
+/// A convenient abstraction for adding all the systems necessary for a binned
+/// render phase to the render app.
+///
+/// This is the version used when the pipeline supports GPU preprocessing: e.g.
+/// 3D PBR meshes.
+pub struct BinnedRenderPhasePlugin<BPI, GFBD>(PhantomData<(BPI, GFBD)>)
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData;
+
+impl<BPI, GFBD> Default for BinnedRenderPhasePlugin<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    fn default() -> Self {
+        Self(PhantomData)
+    }
+}
+
+impl<BPI, GFBD> Plugin for BinnedRenderPhasePlugin<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData + Sync + Send + 'static,
+{
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.add_systems(
+            Render,
+            (
+                batching::sort_binned_render_phase::<BPI>.in_set(RenderSet::PhaseSort),
+                (
+                    no_gpu_preprocessing::batch_and_prepare_binned_render_phase::<BPI, GFBD>
+                        .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                    gpu_preprocessing::batch_and_prepare_binned_render_phase::<BPI, GFBD>.run_if(
+                        resource_exists::<
+                            BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                        >,
+                    ),
+                )
+                    .in_set(RenderSet::PrepareResources),
+            ),
+        );
+    }
+}
+
+/// A convenient abstraction for adding all the systems necessary for a sorted
+/// render phase to the render app.
+///
+/// This is the version used when the pipeline supports GPU preprocessing: e.g.
+/// 3D PBR meshes.
+pub struct SortedRenderPhasePlugin<SPI, GFBD>(PhantomData<(SPI, GFBD)>)
+where
+    SPI: SortedPhaseItem,
+    GFBD: GetFullBatchData;
+
+impl<SPI, GFBD> Default for SortedRenderPhasePlugin<SPI, GFBD>
+where
+    SPI: SortedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    fn default() -> Self {
+        Self(PhantomData)
+    }
+}
+
+impl<SPI, GFBD> Plugin for SortedRenderPhasePlugin<SPI, GFBD>
+where
+    SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem,
+    GFBD: GetFullBatchData + Sync + Send + 'static,
+{
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.add_systems(
+            Render,
+            (
+                no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
+                    .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>.run_if(
+                    resource_exists::<
+                        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                    >,
+                ),
+            )
+                .in_set(RenderSet::PrepareResources),
+        );
+    }
+}
+
 impl UnbatchableBinnedEntityBufferIndex {
     /// Adds a new entity to the list of unbatchable binned entities.
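    ///
    /// For example, the GPU preprocessing path above records the output slot it
    /// reserved for an unbatchable entity like this (mirroring the call in
    /// `gpu_preprocessing.rs`):
    ///
    /// ```ignore
    /// unbatchables.buffer_indices.add(GpuArrayBufferIndex::<GFBD::BufferData> {
    ///     index: output_index,
    ///     dynamic_offset: None,
    ///     element_type: PhantomData,
    /// });
    /// ```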
    pub fn add<T>(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex<T>)
@@ -463,12 +567,10 @@ where
 ///
 /// The data required for rendering an entity is extracted from the main world in the
 /// [`ExtractSchedule`](crate::ExtractSchedule).
-/// Then it has to be queued up for rendering during the
-/// [`RenderSet::Queue`](crate::RenderSet::Queue), by adding a corresponding phase item to
-/// a render phase.
+/// Then it has to be queued up for rendering during the [`RenderSet::Queue`],
+/// by adding a corresponding phase item to a render phase.
 /// Afterwards it will be possibly sorted and rendered automatically in the
-/// [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort) and
-/// [`RenderSet::Render`](crate::RenderSet::Render), respectively.
+/// [`RenderSet::PhaseSort`] and [`RenderSet::Render`], respectively.
 ///
 /// `PhaseItem`s come in two flavors: [`BinnedPhaseItem`]s and
 /// [`SortedPhaseItem`]s.
diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs
index f330d52dea50a..8a0f77daafb3d 100644
--- a/crates/bevy_render/src/render_resource/buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/buffer_vec.rs
@@ -1,3 +1,5 @@
+use std::marker::PhantomData;
+
 use crate::{
     render_resource::Buffer,
     renderer::{RenderDevice, RenderQueue},
@@ -5,6 +7,8 @@ use crate::{
 use bytemuck::{must_cast_slice, NoUninit};
 use wgpu::BufferUsages;
 
+use super::GpuArrayBufferable;
+
 /// A structure for storing raw bytes that have already been properly formatted
 /// for use by the GPU.
 ///
@@ -160,3 +164,96 @@ impl<T: NoUninit> Extend<T> for BufferVec<T> {
         self.values.extend(iter);
     }
 }
+
+/// Like a [`BufferVec`], but only reserves space on the GPU for elements
+/// instead of initializing them CPU-side.
+///
+/// This type is useful when you're accumulating "output slots" for a GPU
+/// compute shader to write into.
+///
+/// The type `T` need not be [`NoUninit`], unlike [`BufferVec`]; it only has to
+/// be [`GpuArrayBufferable`].
+pub struct UninitBufferVec<T>
+where
+    T: GpuArrayBufferable,
+{
+    buffer: Option<Buffer>,
+    len: usize,
+    capacity: usize,
+    item_size: usize,
+    buffer_usage: BufferUsages,
+    label: Option<String>,
+    label_changed: bool,
+    phantom: PhantomData<T>,
+}
+
+impl<T> UninitBufferVec<T>
+where
+    T: GpuArrayBufferable,
+{
+    /// Creates a new [`UninitBufferVec`] with the given [`BufferUsages`].
+    pub const fn new(buffer_usage: BufferUsages) -> Self {
+        Self {
+            len: 0,
+            buffer: None,
+            capacity: 0,
+            item_size: std::mem::size_of::<T>(),
+            buffer_usage,
+            label: None,
+            label_changed: false,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Returns the buffer, if allocated.
+    #[inline]
+    pub fn buffer(&self) -> Option<&Buffer> {
+        self.buffer.as_ref()
+    }
+
+    /// Reserves space for one more element in the buffer and returns its index.
+    pub fn add(&mut self) -> usize {
+        let index = self.len;
+        self.len += 1;
+        index
+    }
+
+    /// Returns true if no elements have been added to this [`UninitBufferVec`].
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Removes all elements from the buffer.
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Materializes the buffer on the GPU with space for `capacity` elements.
+    ///
+    /// If the buffer is already big enough, this function doesn't reallocate
+    /// the buffer.
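+    ///
+    /// A sketch of the intended call pattern (illustrative; the element type is
+    /// an assumption, and in practice [`Self::write_buffer`] calls this for
+    /// you):
+    ///
+    /// ```ignore
+    /// let mut slots = UninitBufferVec::<MeshUniform>::new(BufferUsages::STORAGE);
+    /// let index = slots.add(); // reserve an output slot, CPU-side only
+    /// slots.reserve(64, &render_device); // allocate GPU space for 64 elements
+    /// ```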
+    pub fn reserve(&mut self, capacity: usize, device: &RenderDevice) {
+        if capacity <= self.capacity && !self.label_changed {
+            return;
+        }
+
+        self.capacity = capacity;
+        let size = self.item_size * capacity;
+        self.buffer = Some(device.create_buffer(&wgpu::BufferDescriptor {
+            label: self.label.as_deref(),
+            size: size as wgpu::BufferAddress,
+            usage: BufferUsages::COPY_DST | self.buffer_usage,
+            mapped_at_creation: false,
+        }));
+
+        self.label_changed = false;
+    }
+
+    /// Materializes the buffer on the GPU, with an appropriate size for the
+    /// elements that have been pushed so far.
+    pub fn write_buffer(&mut self, device: &RenderDevice) {
+        if !self.is_empty() {
+            self.reserve(self.len, device);
+        }
+    }
+}
diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs
index edf8dd4fabf4a..257ad61dfbb7f 100644
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@@ -11,12 +11,12 @@ use bevy_ecs::{
 };
 use bevy_math::{Affine3, Vec4};
 use bevy_reflect::{std_traits::ReflectDefault, Reflect};
+use bevy_render::batching::no_gpu_preprocessing::{
+    batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer,
+};
 use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef};
 use bevy_render::{
-    batching::{
-        batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, GetBatchData,
-        NoAutomaticBatching,
-    },
+    batching::{GetBatchData, NoAutomaticBatching},
     globals::{GlobalsBuffer, GlobalsUniform},
     mesh::{GpuBufferInfo, Mesh},
     render_asset::RenderAssets,
@@ -116,9 +116,13 @@ impl Plugin for Mesh2dRenderPlugin {
         let mut mesh_bindings_shader_defs = Vec::with_capacity(1);
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
-            if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::<Mesh2dUniform>::batch_size(
-                render_app.world().resource::<RenderDevice>(),
-            ) {
+            let render_device = render_app.world().resource::<RenderDevice>();
+            let batched_instance_buffer =
+                BatchedInstanceBuffer::<Mesh2dUniform>::new(render_device);
+
+            if let Some(per_object_buffer_batch_size) =
+                GpuArrayBuffer::<Mesh2dUniform>::batch_size(render_device)
+            {
                 mesh_bindings_shader_defs.push(ShaderDefVal::UInt(
                     "PER_OBJECT_BUFFER_BATCH_SIZE".into(),
                     per_object_buffer_batch_size,
@@ -126,9 +130,7 @@ impl Plugin for Mesh2dRenderPlugin {
             }
 
             render_app
-                .insert_resource(GpuArrayBuffer::<Mesh2dUniform>::new(
-                    render_app.world().resource::<RenderDevice>(),
-                ))
+                .insert_resource(batched_instance_buffer)
                 .init_resource::<Mesh2dPipeline>();
         }
 
@@ -571,9 +573,9 @@ pub fn prepare_mesh2d_bind_group(
     mut commands: Commands,
     mesh2d_pipeline: Res<Mesh2dPipeline>,
     render_device: Res<RenderDevice>,
-    mesh2d_uniforms: Res<GpuArrayBuffer<Mesh2dUniform>>,
+    mesh2d_uniforms: Res<BatchedInstanceBuffer<Mesh2dUniform>>,
 ) {
-    if let Some(binding) = mesh2d_uniforms.binding() {
+    if let Some(binding) = mesh2d_uniforms.instance_data_binding() {
         commands.insert_resource(Mesh2dBindGroup {
             value: render_device.create_bind_group(
                 "mesh2d_bind_group",
diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs
index b44c96e1c3029..b1c91c8973b07 100644
--- a/examples/shader/shader_instancing.rs
+++ b/examples/shader/shader_instancing.rs
@@ -127,7 +127,7 @@ fn queue_custom(
     let view_key = msaa_key | MeshPipelineKey::from_hdr(view.hdr);
     let rangefinder = view.rangefinder3d();
     for entity in &material_meshes {
-        let Some(mesh_instance) = render_mesh_instances.get(&entity) else {
+        let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) else {
             continue;
         };
         let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
@@ -142,8 +142,7 @@ fn queue_custom(
             entity,
             pipeline,
             draw_function: draw_custom,
-            distance: rangefinder
-                .distance_translation(&mesh_instance.transforms.transform.translation),
+            distance: rangefinder.distance_translation(&mesh_instance.translation),
             batch_range: 0..1,
             dynamic_offset: None,
         });
@@ -246,7 +245,8 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMeshInstanced {
         (meshes, render_mesh_instances): SystemParamItem<'w, '_, Self::Param>,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
-        let Some(mesh_instance) = render_mesh_instances.get(&item.entity()) else {
+        let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(item.entity())
+        else {
             return RenderCommandResult::Failure;
         };
         let Some(gpu_mesh) = meshes.into_inner().get(mesh_instance.mesh_asset_id) else {