diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 6603df730696f..7d7b7c9afe9ab 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -79,6 +79,8 @@ pub mod graph { /// Label for the screen space ambient occlusion render node. ScreenSpaceAmbientOcclusion, DeferredLightingPass, + /// Label for the compute shader instance data building pass. + GpuPreprocess, } } @@ -133,6 +135,11 @@ pub struct PbrPlugin { pub prepass_enabled: bool, /// Controls if [`DeferredPbrLightingPlugin`] is added. pub add_default_deferred_lighting_plugin: bool, + /// Controls if GPU [`MeshUniform`] building is enabled. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, } impl Default for PbrPlugin { @@ -140,6 +147,7 @@ impl Default for PbrPlugin { Self { prepass_enabled: true, add_default_deferred_lighting_plugin: true, + use_gpu_instance_buffer_builder: true, } } } @@ -280,7 +288,9 @@ impl Plugin for PbrPlugin { .register_type::() .init_resource::() .add_plugins(( - MeshRenderPlugin, + MeshRenderPlugin { + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, + }, MaterialPlugin:: { prepass_enabled: self.prepass_enabled, ..Default::default() @@ -292,6 +302,9 @@ impl Plugin for PbrPlugin { ExtractComponentPlugin::::default(), LightmapPlugin, LightProbePlugin, + GpuMeshPreprocessPlugin { + use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder, + }, )) .configure_sets( PostUpdate, @@ -386,15 +399,6 @@ impl Plugin for PbrPlugin { let draw_3d_graph = graph.get_sub_graph_mut(Core3d).unwrap(); draw_3d_graph.add_node(NodePbr::ShadowPass, shadow_pass_node); draw_3d_graph.add_node_edge(NodePbr::ShadowPass, Node3d::StartMainPass); - - render_app.ignore_ambiguity( - bevy_render::Render, - bevy_core_pipeline::core_3d::prepare_core_3d_transmission_textures, - bevy_render::batching::batch_and_prepare_sorted_render_phase::< - bevy_core_pipeline::core_3d::Transmissive3d, - MeshPipeline, - >, - ); } fn finish(&self, app: &mut App) { diff --git a/crates/bevy_pbr/src/lightmap/mod.rs b/crates/bevy_pbr/src/lightmap/mod.rs index a57c2b8e4618e..fbb5ea2731379 100644 --- a/crates/bevy_pbr/src/lightmap/mod.rs +++ b/crates/bevy_pbr/src/lightmap/mod.rs @@ -48,7 +48,7 @@ use bevy_render::{ }; use bevy_utils::HashSet; -use crate::RenderMeshInstances; +use crate::{ExtractMeshesSet, RenderMeshInstances}; /// The ID of the lightmap shader. 
pub const LIGHTMAP_SHADER_HANDLE: Handle = @@ -132,10 +132,9 @@ impl Plugin for LightmapPlugin { return; }; - render_app.init_resource::().add_systems( - ExtractSchedule, - extract_lightmaps.after(crate::extract_meshes), - ); + render_app + .init_resource::() + .add_systems(ExtractSchedule, extract_lightmaps.after(ExtractMeshesSet)); } } @@ -159,8 +158,8 @@ fn extract_lightmaps( if !view_visibility.get() || images.get(&lightmap.image).is_none() || !render_mesh_instances - .get(&entity) - .and_then(|mesh_instance| meshes.get(mesh_instance.mesh_asset_id)) + .mesh_asset_id(entity) + .and_then(|mesh_asset_id| meshes.get(mesh_asset_id)) .is_some_and(|mesh| mesh.layout.0.contains(Mesh::ATTRIBUTE_UV_1.id)) { continue; diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index 4785c4aed98db..082a07dbb6219 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -508,6 +508,8 @@ pub const fn screen_space_specular_transmission_pipeline_key( } } +/// For each view, iterates over all the meshes visible from that view and adds +/// them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as appropriate. #[allow(clippy::too_many_arguments)] pub fn queue_material_meshes( opaque_draw_functions: Res>, @@ -647,7 +649,8 @@ pub fn queue_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(mesh) = render_meshes.get(mesh_instance.mesh_asset_id) else { @@ -693,8 +696,7 @@ pub fn queue_material_meshes( match material.properties.alpha_mode { AlphaMode::Opaque => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -717,8 +719,7 @@ pub fn queue_material_meshes( } AlphaMode::Mask(_) => { if material.properties.reads_view_transmission_texture { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transmissive_phase.add(Transmissive3d { entity: *visible_entity, @@ -746,8 +747,7 @@ pub fn queue_material_meshes( | AlphaMode::Premultiplied | AlphaMode::Add | AlphaMode::Multiply => { - let distance = rangefinder - .distance_translation(&mesh_instance.transforms.transform.translation) + let distance = rangefinder.distance_translation(&mesh_instance.translation) + material.properties.depth_bias; transparent_phase.add(Transparent3d { entity: *visible_entity, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index ad954cbc26f43..230b7455d83c8 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -1,6 +1,5 @@ mod prepass_bindings; -use bevy_render::batching::{batch_and_prepare_binned_render_phase, sort_binned_render_phase}; use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef}; use bevy_render::render_resource::binding_types::uniform_buffer; pub use prepass_bindings::*; @@ -145,7 +144,11 @@ where update_mesh_previous_global_transforms, update_previous_view_data, ), - ); + ) + .add_plugins(( + 
BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + )); } let Some(render_app) = app.get_sub_app_mut(RenderApp) else { @@ -157,18 +160,7 @@ where .add_systems(ExtractSchedule, extract_camera_previous_view_data) .add_systems( Render, - ( - ( - sort_binned_render_phase::, - sort_binned_render_phase:: - ).in_set(RenderSet::PhaseSort), - ( - prepare_previous_view_uniforms, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - ).in_set(RenderSet::PrepareResources), - ) + prepare_previous_view_uniforms.in_set(RenderSet::PrepareResources), ); } @@ -786,7 +778,8 @@ pub fn queue_prepass_material_meshes( let Some(material_asset_id) = render_material_instances.get(visible_entity) else { continue; }; - let Some(mesh_instance) = render_mesh_instances.get(visible_entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(*visible_entity) + else { continue; }; let Some(material) = render_materials.get(*material_asset_id) else { diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs new file mode 100644 index 0000000000000..21eff19668a65 --- /dev/null +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -0,0 +1,299 @@ +//! GPU mesh preprocessing. +//! +//! This is an optional pass that uses a compute shader to reduce the amount of +//! data that has to be transferred from the CPU to the GPU. When enabled, +//! instead of transferring [`MeshUniform`]s to the GPU, we transfer the smaller +//! [`MeshInputUniform`]s instead and use the GPU to calculate the remaining +//! derived fields in [`MeshUniform`]. + +use std::num::NonZeroU64; + +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_ecs::{ + component::Component, + entity::Entity, + query::QueryState, + schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, + system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, + world::{FromWorld, World}, +}; +use bevy_render::{ + batching::gpu_preprocessing::{self, BatchedInstanceBuffers, PreprocessWorkItem}, + render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, + render_resource::{ + binding_types::{storage_buffer, storage_buffer_read_only}, + BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, BufferBinding, + CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, + DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, + SpecializedComputePipeline, SpecializedComputePipelines, + }, + renderer::{RenderContext, RenderDevice}, + Render, RenderApp, RenderSet, +}; +use bevy_utils::tracing::warn; + +use crate::{graph::NodePbr, MeshInputUniform, MeshUniform}; + +/// The handle to the `mesh_preprocess.wgsl` compute shader. +pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = + Handle::weak_from_u128(16991728318640779533); + +/// The GPU workgroup size. +const WORKGROUP_SIZE: usize = 64; + +/// A plugin that builds mesh uniforms on GPU. +/// +/// This will only be added if the platform supports compute shaders (e.g. not +/// on WebGL 2). +pub struct GpuMeshPreprocessPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, +} + +/// The render node for the mesh uniform building pass. 
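// This node is wired into `Core3d` upstream of both `Node3d::Prepass` and
// `NodePbr::ShadowPass` (see the `add_render_graph_edges` calls in `finish`
// below), so every pass that reads the expanded `MeshUniform` buffer runs
// after the compute dispatch.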
+pub struct GpuPreprocessNode { + view_query: QueryState<(Entity, Read)>, +} + +/// The compute shader pipeline for the mesh uniform building pass. +#[derive(Resource)] +pub struct PreprocessPipeline { + /// The single bind group layout for the compute shader. + pub bind_group_layout: BindGroupLayout, + /// The pipeline ID for the compute shader. + /// + /// This gets filled in in `prepare_preprocess_pipeline`. + pub pipeline_id: Option, +} + +/// The compute shader bind group for the mesh uniform building pass. +/// +/// This goes on the view. +#[derive(Component)] +pub struct PreprocessBindGroup(BindGroup); + +impl Plugin for GpuMeshPreprocessPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + MESH_PREPROCESS_SHADER_HANDLE, + "mesh_preprocess.wgsl", + Shader::from_wgsl + ); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + // This plugin does nothing if GPU instance buffer building isn't in + // use. + let render_device = render_app.world().resource::(); + if !self.use_gpu_instance_buffer_builder + || !gpu_preprocessing::can_preprocess_on_gpu(render_device) + { + return; + } + + // Stitch the node in. + render_app + .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, Node3d::Prepass)) + .add_render_graph_edges(Core3d, (NodePbr::GpuPreprocess, NodePbr::ShadowPass)) + .init_resource::() + .init_resource::>() + .add_systems( + Render, + ( + prepare_preprocess_pipeline.in_set(RenderSet::Prepare), + prepare_preprocess_bind_groups + .run_if( + resource_exists::>, + ) + .in_set(RenderSet::PrepareBindGroups), + ) + ); + } +} + +impl FromWorld for GpuPreprocessNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl Node for GpuPreprocessNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + // Grab the [`BatchedInstanceBuffers`]. + let BatchedInstanceBuffers { + work_item_buffers: ref index_buffers, + .. + } = world.resource::>(); + + let pipeline_cache = world.resource::(); + let preprocess_pipeline = world.resource::(); + + let Some(preprocess_pipeline_id) = preprocess_pipeline.pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't created"); + return Ok(()); + }; + + let Some(preprocess_pipeline) = pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; + + let mut compute_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some("mesh preprocessing"), + timestamp_writes: None, + }); + + compute_pass.set_pipeline(preprocess_pipeline); + + // Run the compute passes. 
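        // One compute invocation per `PreprocessWorkItem`: with
        // WORKGROUP_SIZE = 64, a view with, say, 1000 visible meshes
        // dispatches ceil(1000 / 64) = 16 workgroups, and the shader's bounds
        // check against `arrayLength(&work_items)` makes the spare
        // invocations in the final workgroup return early.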
+ for (view, bind_group) in self.view_query.iter_manual(world) { + let index_buffer = &index_buffers[&view]; + compute_pass.set_bind_group(0, &bind_group.0, &[]); + let workgroup_count = index_buffer.len().div_ceil(WORKGROUP_SIZE); + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + Ok(()) + } +} + +impl SpecializedComputePipeline for PreprocessPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("mesh preprocessing".into()), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![], + shader: MESH_PREPROCESS_SHADER_HANDLE, + shader_defs: vec![], + entry_point: "main".into(), + } + } +} + +impl FromWorld for PreprocessPipeline { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + let bind_group_layout_entries = DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // `current_input` + storage_buffer_read_only::(false), + // `previous_input` + storage_buffer_read_only::(false), + // `indices` + storage_buffer_read_only::(false), + // `output` + storage_buffer::(false), + ), + ); + + let bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms bind group layout", + &bind_group_layout_entries, + ); + + PreprocessPipeline { + bind_group_layout, + pipeline_id: None, + } + } +} + +/// A system that specializes the `mesh_preprocess.wgsl` pipeline if necessary. +pub fn prepare_preprocess_pipeline( + pipeline_cache: Res, + mut pipelines: ResMut>, + mut preprocess_pipeline: ResMut, +) { + if preprocess_pipeline.pipeline_id.is_some() { + return; + } + + let preprocess_pipeline_id = pipelines.specialize(&pipeline_cache, &preprocess_pipeline, ()); + preprocess_pipeline.pipeline_id = Some(preprocess_pipeline_id); +} + +/// A system that attaches the mesh uniform buffers to the bind group for the +/// compute shader. +pub fn prepare_preprocess_bind_groups( + mut commands: Commands, + render_device: Res, + batched_instance_buffers: Res>, + pipeline: Res, +) { + // Grab the `BatchedInstanceBuffers`. + let BatchedInstanceBuffers { + data_buffer: ref data_buffer_vec, + work_item_buffers: ref index_buffers, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + } = batched_instance_buffers.into_inner(); + + let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( + current_input_buffer_vec.buffer(), + previous_input_buffer_vec.buffer(), + data_buffer_vec.buffer(), + ) else { + return; + }; + + for (view, index_buffer_vec) in index_buffers { + let Some(index_buffer) = index_buffer_vec.buffer() else { + continue; + }; + + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
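        // `PreprocessWorkItem` is two `u32`s, so `min_size()` here is 8 bytes
        // and a buffer of, say, 10 work items gets an 80-byte binding even if
        // the backing allocation is larger. `arrayLength(&work_items)` in the
        // shader then reports the live element count rather than the
        // allocation size.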
+ let index_buffer_size = NonZeroU64::try_from( + index_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + commands + .entity(*view) + .insert(PreprocessBindGroup(render_device.create_bind_group( + "preprocess_bind_group", + &pipeline.bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + previous_input_buffer.as_entire_binding(), + BindingResource::Buffer(BufferBinding { + buffer: index_buffer, + offset: 0, + size: index_buffer_size, + }), + data_buffer.as_entire_binding(), + )), + ))); + } +} diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index dfc338f2fb3e1..070e1a9d7b99d 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -1596,6 +1596,9 @@ pub fn prepare_clusters( } } +/// For each shadow cascade, iterates over all the meshes "visible" from it and +/// adds them to [`BinnedRenderPhase`]s or [`SortedRenderPhase`]s as +/// appropriate. #[allow(clippy::too_many_arguments)] pub fn queue_shadows( shadow_draw_functions: Res>, @@ -1651,10 +1654,14 @@ pub fn queue_shadows( light_key.set(MeshPipelineKey::DEPTH_CLAMP_ORTHO, is_directional_light); for entity in visible_entities.iter().copied() { - let Some(mesh_instance) = render_mesh_instances.get(&entity) else { + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) + else { continue; }; - if !mesh_instance.shadow_caster { + if !mesh_instance + .flags + .contains(RenderMeshInstanceFlags::SHADOW_CASTER) + { continue; } let Some(material_asset_id) = render_material_instances.get(&entity) else { diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index f64fa2b2b5f12..ddc81604f4d13 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -1,3 +1,5 @@ +use std::mem; + use bevy_asset::{load_internal_asset, AssetId}; use bevy_core_pipeline::{ core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT}, @@ -10,20 +12,22 @@ use bevy_ecs::{ query::ROQueryItem, system::{lifetimeless::*, SystemParamItem, SystemState}, }; -use bevy_math::{Affine3, Rect, UVec2, Vec4}; +use bevy_math::{Affine3, Rect, UVec2, Vec3, Vec4}; use bevy_render::{ batching::{ - batch_and_prepare_binned_render_phase, batch_and_prepare_sorted_render_phase, - sort_binned_render_phase, write_batched_instance_buffer, GetBatchData, GetBinnedBatchData, - NoAutomaticBatching, + clear_batched_instance_buffers, gpu_preprocessing, no_gpu_preprocessing, GetBatchData, + GetFullBatchData, NoAutomaticBatching, }, mesh::*, render_asset::RenderAssets, - render_phase::{PhaseItem, RenderCommand, RenderCommandResult, TrackedRenderPass}, + render_phase::{ + BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult, + SortedRenderPhasePlugin, TrackedRenderPass, + }, render_resource::*, renderer::{RenderDevice, RenderQueue}, texture::{BevyDefault, DefaultImageSampler, ImageSampler, TextureFormatPixelInfo}, - view::{ViewTarget, ViewUniformOffset, ViewVisibility}, + view::{prepare_view_targets, ViewTarget, ViewUniformOffset, ViewVisibility}, Extract, }; use bevy_transform::components::GlobalTransform; @@ -31,6 +35,8 @@ use bevy_utils::{tracing::error, Entry, HashMap, Parallel}; #[cfg(debug_assertions)] use bevy_utils::warn_once; +use bytemuck::{Pod, Zeroable}; +use nonmax::NonMaxU32; use static_assertions::const_assert_eq; use crate::render::{ @@ -45,8 +51,15 @@ use 
self::irradiance_volume::IRRADIANCE_VOLUMES_ARE_USABLE; use super::skin::SkinIndices; +/// Provides support for rendering 3D meshes. #[derive(Default)] -pub struct MeshRenderPlugin; +pub struct MeshRenderPlugin { + /// Whether we're building [`MeshUniform`]s on GPU. + /// + /// This requires compute shader support and so will be forcibly disabled if + /// the platform doesn't support those. + pub use_gpu_instance_buffer_builder: bool, +} pub const FORWARD_IO_HANDLE: Handle = Handle::weak_from_u128(2645551199423808407); pub const MESH_VIEW_TYPES_HANDLE: Handle = Handle::weak_from_u128(8140454348013264787); @@ -108,47 +121,32 @@ impl Plugin for MeshRenderPlugin { app.add_systems( PostUpdate, (no_automatic_skin_batching, no_automatic_morph_batching), - ); + ) + .add_plugins(( + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + BinnedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + SortedRenderPhasePlugin::::default(), + )); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { render_app - .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() .init_resource::() - .allow_ambiguous_resource::>() + .add_systems(ExtractSchedule, (extract_skins, extract_morphs)) .add_systems( ExtractSchedule, - (extract_meshes, extract_skins, extract_morphs), + clear_batched_instance_buffers::.before(ExtractMeshesSet), ) .add_systems( Render, ( - ( - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - sort_binned_render_phase::, - ) - .in_set(RenderSet::PhaseSort), - ( - batch_and_prepare_binned_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_sorted_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::, - batch_and_prepare_binned_render_phase::< - AlphaMask3dDeferred, - MeshPipeline, - >, - ) - .in_set(RenderSet::PrepareResources), - write_batched_instance_buffer:: - .in_set(RenderSet::PrepareResourcesFlush), prepare_skins.in_set(RenderSet::PrepareResources), prepare_morphs.in_set(RenderSet::PrepareResources), prepare_mesh_bind_group.in_set(RenderSet::PrepareBindGroups), @@ -162,20 +160,59 @@ impl Plugin for MeshRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::::batch_size( - render_app.world().resource::(), - ) { + let render_device = render_app.world().resource::(); + let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder + && gpu_preprocessing::can_preprocess_on_gpu(render_device); + + let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder); + render_app.insert_resource(render_mesh_instances); + + if use_gpu_instance_buffer_builder { + render_app + .init_resource::>( + ) + .add_systems( + ExtractSchedule, + extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + ( + gpu_preprocessing::write_batched_instance_buffers:: + .in_set(RenderSet::PrepareResourcesFlush), + gpu_preprocessing::delete_old_work_item_buffers:: + .in_set(RenderSet::ManageViews) + .after(prepare_view_targets), + ), + ); + } else { + let render_device = render_app.world().resource::(); + let cpu_batched_instance_buffer = + 
no_gpu_preprocessing::BatchedInstanceBuffer::::new(render_device); + render_app + .insert_resource(cpu_batched_instance_buffer) + .add_systems( + ExtractSchedule, + extract_meshes_for_cpu_building.in_set(ExtractMeshesSet), + ) + .add_systems( + Render, + no_gpu_preprocessing::write_batched_instance_buffer:: + .in_set(RenderSet::PrepareResourcesFlush), + ); + }; + + let render_device = render_app.world().resource::(); + if let Some(per_object_buffer_batch_size) = + GpuArrayBuffer::::batch_size(render_device) + { mesh_bindings_shader_defs.push(ShaderDefVal::UInt( "PER_OBJECT_BUFFER_BATCH_SIZE".into(), per_object_buffer_batch_size, )); } - render_app - .insert_resource(GpuArrayBuffer::::new( - render_app.world().resource::(), - )) - .init_resource::(); + render_app.init_resource::(); } // Load the mesh_bindings shader module here as it depends on runtime information about @@ -221,6 +258,36 @@ pub struct MeshUniform { pub lightmap_uv_rect: UVec2, } +/// Information that has to be transferred from CPU to GPU in order to produce +/// the full [`MeshUniform`]. +/// +/// This is essentially a subset of the fields in [`MeshUniform`] above. +#[derive(ShaderType, Pod, Zeroable, Clone, Copy)] +#[repr(C)] +pub struct MeshInputUniform { + /// Affine 4x3 matrix transposed to 3x4. + pub transform: [Vec4; 3], + /// Four 16-bit unsigned normalized UV values packed into a `UVec2`: + /// + /// ```text + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` + pub lightmap_uv_rect: UVec2, + /// Various [`MeshFlags`]. + pub flags: u32, + /// The index of this mesh's [`MeshInputUniform`] in the previous frame's + /// buffer, if applicable. + /// + /// This is used for TAA. If not present, this will be `u32::MAX`. + pub previous_input_index: u32, +} + impl MeshUniform { pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option) -> Self { let (inverse_transpose_model_a, inverse_transpose_model_b) = @@ -250,26 +317,263 @@ bitflags::bitflags! { } } -pub struct RenderMeshInstance { +impl MeshFlags { + fn from_components( + transform: &GlobalTransform, + not_shadow_receiver: bool, + transmitted_receiver: bool, + ) -> MeshFlags { + let mut mesh_flags = if not_shadow_receiver { + MeshFlags::empty() + } else { + MeshFlags::SHADOW_RECEIVER + }; + if transmitted_receiver { + mesh_flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; + } + if transform.affine().matrix3.determinant().is_sign_positive() { + mesh_flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; + } + + mesh_flags + } +} + +bitflags::bitflags! { + /// Various useful flags for [`RenderMeshInstance`]s. + #[derive(Clone, Copy)] + pub struct RenderMeshInstanceFlags: u8 { + /// The mesh casts shadows. + const SHADOW_CASTER = 1 << 0; + /// The mesh can participate in automatic batching. + const AUTOMATIC_BATCHING = 1 << 1; + /// The mesh had a transform last frame and so is eligible for TAA. + const HAVE_PREVIOUS_TRANSFORM = 1 << 2; + } +} + +/// CPU data that the render world keeps for each entity, when *not* using GPU +/// mesh uniform building. +#[derive(Deref)] +pub struct RenderMeshInstanceCpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. + #[deref] + pub shared: RenderMeshInstanceShared, + /// The transform of the mesh. 
+ /// + /// This will be written into the [`MeshUniform`] at the appropriate time. pub transforms: MeshTransforms, +} + +/// CPU data that the render world needs to keep for each entity that contains a +/// mesh when using GPU mesh uniform building. +#[derive(Deref)] +pub struct RenderMeshInstanceGpu { + /// Data shared between both the CPU mesh uniform building and the GPU mesh + /// uniform building paths. + #[deref] + pub shared: RenderMeshInstanceShared, + /// The translation of the mesh. + /// + /// This is the only part of the transform that we have to keep on CPU (for + /// distance sorting). + pub translation: Vec3, + /// The index of the [`MeshInputUniform`] in the buffer. + pub current_uniform_index: NonMaxU32, +} + +/// CPU data that the render world needs to keep about each entity that contains +/// a mesh. +pub struct RenderMeshInstanceShared { + /// The [`AssetId`] of the mesh. pub mesh_asset_id: AssetId, + /// A slot for the material bind group ID. + /// + /// This is filled in during [`crate::material::queue_material_meshes`]. pub material_bind_group_id: AtomicMaterialBindGroupId, - pub shadow_caster: bool, - pub automatic_batching: bool, + /// Various flags. + pub flags: RenderMeshInstanceFlags, } -impl RenderMeshInstance { +/// Information that is gathered during the parallel portion of mesh extraction +/// when GPU mesh uniform building is enabled. +/// +/// From this, the [`MeshInputUniform`] and [`RenderMeshInstanceGpu`] are +/// prepared. +pub struct RenderMeshInstanceGpuBuilder { + /// Data that will be placed on the [`RenderMeshInstanceGpu`]. + pub shared: RenderMeshInstanceShared, + /// The current transform. + pub transform: Affine3, + /// Four 16-bit unsigned normalized UV values packed into a [`UVec2`]: + /// + /// ```text + /// <--- MSB LSB ---> + /// +---- min v ----+ +---- min u ----+ + /// lightmap_uv_rect.x: vvvvvvvv vvvvvvvv uuuuuuuu uuuuuuuu, + /// +---- max v ----+ +---- max u ----+ + /// lightmap_uv_rect.y: VVVVVVVV VVVVVVVV UUUUUUUU UUUUUUUU, + /// + /// (MSB: most significant bit; LSB: least significant bit.) + /// ``` + pub lightmap_uv_rect: UVec2, + /// Various flags. + pub mesh_flags: MeshFlags, +} + +impl RenderMeshInstanceShared { + fn from_components( + previous_transform: Option<&PreviousGlobalTransform>, + handle: &Handle, + not_shadow_caster: bool, + no_automatic_batching: bool, + ) -> Self { + let mut mesh_instance_flags = RenderMeshInstanceFlags::empty(); + mesh_instance_flags.set(RenderMeshInstanceFlags::SHADOW_CASTER, !not_shadow_caster); + mesh_instance_flags.set( + RenderMeshInstanceFlags::AUTOMATIC_BATCHING, + !no_automatic_batching, + ); + mesh_instance_flags.set( + RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM, + previous_transform.is_some(), + ); + + RenderMeshInstanceShared { + mesh_asset_id: handle.id(), + + flags: mesh_instance_flags, + material_bind_group_id: AtomicMaterialBindGroupId::default(), + } + } + + /// Returns true if this entity is eligible to participate in automatic + /// batching. pub fn should_batch(&self) -> bool { - self.automatic_batching && self.material_bind_group_id.get().is_some() + self.flags + .contains(RenderMeshInstanceFlags::AUTOMATIC_BATCHING) + && self.material_bind_group_id.get().is_some() + } +} + +/// Information that the render world keeps about each entity that contains a +/// mesh. +/// +/// The set of information needed is different depending on whether CPU or GPU +/// [`MeshUniform`] building is in use. 
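// The CPU/GPU choice below is made once at startup, in
// `MeshRenderPlugin::finish`, from the `RenderDevice` limits; an enum rather
// than a generic parameter keeps a single concrete `Resource` type that
// systems such as `queue_material_meshes` can take either way.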
+#[derive(Resource)] +pub enum RenderMeshInstances { + /// Information needed when using CPU mesh instance data building. + CpuBuilding(RenderMeshInstancesCpu), + /// Information needed when using GPU mesh instance data building. + GpuBuilding(RenderMeshInstancesGpu), +} + +/// Information that the render world keeps about each entity that contains a +/// mesh, when using CPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +pub struct RenderMeshInstancesCpu(EntityHashMap); + +/// Information that the render world keeps about each entity that contains a +/// mesh, when using GPU mesh instance data building. +#[derive(Default, Deref, DerefMut)] +pub struct RenderMeshInstancesGpu(EntityHashMap); + +impl RenderMeshInstances { + /// Creates a new [`RenderMeshInstances`] instance. + fn new(use_gpu_instance_buffer_builder: bool) -> RenderMeshInstances { + if use_gpu_instance_buffer_builder { + RenderMeshInstances::GpuBuilding(RenderMeshInstancesGpu::default()) + } else { + RenderMeshInstances::CpuBuilding(RenderMeshInstancesCpu::default()) + } + } + + /// Returns the ID of the mesh asset attached to the given entity, if any. + pub(crate) fn mesh_asset_id(&self, entity: Entity) -> Option> { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => instances.mesh_asset_id(entity), + RenderMeshInstances::GpuBuilding(ref instances) => instances.mesh_asset_id(entity), + } } + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + pub fn render_mesh_queue_data(&self, entity: Entity) -> Option { + match *self { + RenderMeshInstances::CpuBuilding(ref instances) => { + instances.render_mesh_queue_data(entity) + } + RenderMeshInstances::GpuBuilding(ref instances) => { + instances.render_mesh_queue_data(entity) + } + } + } +} + +pub(crate) trait RenderMeshInstancesTable { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option>; + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + fn render_mesh_queue_data(&self, entity: Entity) -> Option; } -#[derive(Default, Resource, Deref, DerefMut)] -pub struct RenderMeshInstances(EntityHashMap); +impl RenderMeshInstancesTable for RenderMeshInstancesCpu { + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.transforms.transform.translation, + }) + } +} -pub fn extract_meshes( +impl RenderMeshInstancesTable for RenderMeshInstancesGpu { + /// Returns the ID of the mesh asset attached to the given entity, if any. + fn mesh_asset_id(&self, entity: Entity) -> Option> { + self.get(&entity).map(|instance| instance.mesh_asset_id) + } + + /// Constructs [`RenderMeshQueueData`] for the given entity, if it has a + /// mesh attached. + fn render_mesh_queue_data(&self, entity: Entity) -> Option { + self.get(&entity).map(|instance| RenderMeshQueueData { + shared: &instance.shared, + translation: instance.translation, + }) + } +} + +/// Data that [`crate::material::queue_material_meshes`] and similar systems +/// need in order to place entities that contain meshes in the right batch. +#[derive(Deref)] +pub struct RenderMeshQueueData<'a> { + /// General information about the mesh instance. 
+ #[deref] + pub shared: &'a RenderMeshInstanceShared, + /// The translation of the mesh instance. + pub translation: Vec3, +} + +/// A [`SystemSet`] that encompasses both [`extract_meshes_for_cpu_building`] +/// and [`extract_meshes_for_gpu_building`]. +#[derive(SystemSet, Clone, PartialEq, Eq, Debug, Hash)] +pub struct ExtractMeshesSet; + +/// Extracts meshes from the main world into the render world, populating the +/// [`RenderMeshInstances`]. +/// +/// This is the variant of the system that runs when we're *not* using GPU +/// [`MeshUniform`] building. +pub fn extract_meshes_for_cpu_building( mut render_mesh_instances: ResMut, - mut thread_local_queues: Local>>, + mut render_mesh_instance_queues: Local>>, meshes_query: Extract< Query<( Entity, @@ -299,45 +603,202 @@ pub fn extract_meshes( if !view_visibility.get() { return; } - let transform = transform.affine(); - let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform); - let mut flags = if not_shadow_receiver { - MeshFlags::empty() - } else { - MeshFlags::SHADOW_RECEIVER - }; - if transmitted_receiver { - flags |= MeshFlags::TRANSMITTED_SHADOW_RECEIVER; - } - if transform.matrix3.determinant().is_sign_positive() { - flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3; - } - let transforms = MeshTransforms { - transform: (&transform).into(), - previous_transform: (&previous_transform).into(), - flags: flags.bits(), - }; - thread_local_queues.scope(|queue| { + + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, + ); + + render_mesh_instance_queues.scope(|queue| { + let transform = transform.affine(); queue.push(( entity, - RenderMeshInstance { - mesh_asset_id: handle.id(), - transforms, - shadow_caster: !not_shadow_caster, - material_bind_group_id: AtomicMaterialBindGroupId::default(), - automatic_batching: !no_automatic_batching, + RenderMeshInstanceCpu { + transforms: MeshTransforms { + transform: (&transform).into(), + previous_transform: (&previous_transform + .map(|t| t.0) + .unwrap_or(transform)) + .into(), + flags: mesh_flags.bits(), + }, + shared, }, )); }); }, ); + // Collect the render mesh instances. + let RenderMeshInstances::CpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`extract_meshes_for_cpu_building` should only be called if we're using CPU \ + `MeshUniform` building" + ); + }; + render_mesh_instances.clear(); - for queue in thread_local_queues.iter_mut() { + for queue in render_mesh_instance_queues.iter_mut() { render_mesh_instances.extend(queue.drain(..)); } } +/// Extracts meshes from the main world into the render world and queues +/// [`MeshInputUniform`]s to be uploaded to the GPU. +/// +/// This is the variant of the system that runs when we're using GPU +/// [`MeshUniform`] building. 
+pub fn extract_meshes_for_gpu_building( + mut render_mesh_instances: ResMut, + mut batched_instance_buffers: ResMut< + gpu_preprocessing::BatchedInstanceBuffers, + >, + mut render_mesh_instance_queues: Local>>, + mut prev_render_mesh_instances: Local, + meshes_query: Extract< + Query<( + Entity, + &ViewVisibility, + &GlobalTransform, + Option<&PreviousGlobalTransform>, + Option<&Lightmap>, + &Handle, + Has, + Has, + Has, + Has, + )>, + >, +) { + meshes_query.par_iter().for_each( + |( + entity, + view_visibility, + transform, + previous_transform, + lightmap, + handle, + not_shadow_receiver, + transmitted_receiver, + not_shadow_caster, + no_automatic_batching, + )| { + if !view_visibility.get() { + return; + } + + let mesh_flags = + MeshFlags::from_components(transform, not_shadow_receiver, transmitted_receiver); + + let shared = RenderMeshInstanceShared::from_components( + previous_transform, + handle, + not_shadow_caster, + no_automatic_batching, + ); + + let lightmap_uv_rect = + lightmap::pack_lightmap_uv_rect(lightmap.map(|lightmap| lightmap.uv_rect)); + + render_mesh_instance_queues.scope(|queue| { + queue.push(( + entity, + RenderMeshInstanceGpuBuilder { + shared, + transform: (&transform.affine()).into(), + lightmap_uv_rect, + mesh_flags, + }, + )); + }); + }, + ); + + collect_meshes_for_gpu_building( + &mut render_mesh_instances, + &mut batched_instance_buffers, + &mut render_mesh_instance_queues, + &mut prev_render_mesh_instances, + ); +} + +/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU +/// mesh uniforms are built. +fn collect_meshes_for_gpu_building( + render_mesh_instances: &mut RenderMeshInstances, + batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers< + MeshUniform, + MeshInputUniform, + >, + render_mesh_instance_queues: &mut Parallel>, + prev_render_mesh_instances: &mut RenderMeshInstancesGpu, +) { + // Collect render mesh instances. Build up the uniform buffer. + let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = *render_mesh_instances + else { + panic!( + "`collect_render_mesh_instances_for_gpu_building` should only be called if we're \ + using GPU `MeshUniform` building" + ); + }; + + let gpu_preprocessing::BatchedInstanceBuffers { + ref mut current_input_buffer, + ref mut previous_input_buffer, + .. + } = batched_instance_buffers; + + // Swap buffers. + mem::swap(current_input_buffer, previous_input_buffer); + mem::swap(render_mesh_instances, prev_render_mesh_instances); + + // Build the [`RenderMeshInstance`]s and [`MeshInputUniform`]s. + render_mesh_instances.clear(); + for queue in render_mesh_instance_queues.iter_mut() { + for (entity, builder) in queue.drain(..) { + let previous_input_index = if builder + .shared + .flags + .contains(RenderMeshInstanceFlags::HAVE_PREVIOUS_TRANSFORM) + { + prev_render_mesh_instances + .get(&entity) + .map(|render_mesh_instance| render_mesh_instance.current_uniform_index) + } else { + None + }; + + // Push the mesh input uniform. + let current_uniform_index = current_input_buffer.push(MeshInputUniform { + transform: builder.transform.to_transpose(), + lightmap_uv_rect: builder.lightmap_uv_rect, + flags: builder.mesh_flags.bits(), + previous_input_index: match previous_input_index { + Some(previous_input_index) => previous_input_index.into(), + None => u32::MAX, + }, + }) as u32; + + // Record the [`RenderMeshInstance`]. 
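            // `current_uniform_index` is stored as a `NonMaxU32`, the
            // complement of the `u32::MAX` sentinel written just above: a
            // valid index can never be `u32::MAX`, so that niche is free for
            // `Option` packing on the Rust side.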
+ render_mesh_instances.insert( + entity, + RenderMeshInstanceGpu { + translation: builder.transform.translation, + shared: builder.shared, + current_uniform_index: NonMaxU32::try_from(current_uniform_index) + .unwrap_or_default(), + }, + ); + } + } +} + #[derive(Resource, Clone)] pub struct MeshPipeline { view_layouts: [MeshPipelineViewLayout; MeshPipelineViewLayoutKey::COUNT], @@ -467,6 +928,13 @@ impl GetBatchData for MeshPipeline { (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { + let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_batch_data` should never be called in GPU mesh uniform \ + building mode" + ); + return None; + }; let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); @@ -484,15 +952,45 @@ impl GetBatchData for MeshPipeline { } } -impl GetBinnedBatchData for MeshPipeline { - type Param = (SRes, SRes); +impl GetFullBatchData for MeshPipeline { + type BufferInputData = MeshInputUniform; - type BufferData = MeshUniform; + fn get_index_and_compare_data( + (mesh_instances, lightmaps): &SystemParamItem, + entity: Entity, + ) -> Option<(NonMaxU32, Option)> { + // This should only be called during GPU building. + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_index_and_compare_data` should never be called in CPU mesh uniform building \ + mode" + ); + return None; + }; - fn get_batch_data( + let mesh_instance = mesh_instances.get(&entity)?; + let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); + + Some(( + mesh_instance.current_uniform_index, + mesh_instance.should_batch().then_some(( + mesh_instance.material_bind_group_id.get(), + mesh_instance.mesh_asset_id, + maybe_lightmap.map(|lightmap| lightmap.image), + )), + )) + } + + fn get_binned_batch_data( (mesh_instances, lightmaps): &SystemParamItem, entity: Entity, ) -> Option { + let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_binned_batch_data` should never be called in GPU mesh uniform building mode" + ); + return None; + }; let mesh_instance = mesh_instances.get(&entity)?; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); @@ -501,6 +999,24 @@ impl GetBinnedBatchData for MeshPipeline { maybe_lightmap.map(|lightmap| lightmap.uv_rect), )) } + + fn get_binned_index( + (mesh_instances, _): &SystemParamItem, + entity: Entity, + ) -> Option { + // This should only be called during GPU building. + let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else { + error!( + "`get_binned_index` should never be called in CPU mesh uniform \ + building mode" + ); + return None; + }; + + mesh_instances + .get(&entity) + .map(|entity| entity.current_uniform_index) + } } bitflags::bitflags! 
{
@@ -1031,16 +1547,32 @@ pub fn prepare_mesh_bind_group(
     mut groups: ResMut<MeshBindGroups>,
     mesh_pipeline: Res<MeshPipeline>,
     render_device: Res<RenderDevice>,
-    mesh_uniforms: Res<GpuArrayBuffer<MeshUniform>>,
+    cpu_batched_instance_buffer: Option<
+        Res<no_gpu_preprocessing::BatchedInstanceBuffer<MeshUniform>>,
+    >,
+    gpu_batched_instance_buffers: Option<
+        Res<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
+    >,
     skins_uniform: Res<SkinUniform>,
     weights_uniform: Res<MorphUniform>,
     render_lightmaps: Res<RenderLightmaps>,
 ) {
     groups.reset();
     let layouts = &mesh_pipeline.mesh_layouts;
-    let Some(model) = mesh_uniforms.binding() else {
+
+    let model = if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+        cpu_batched_instance_buffer
+            .into_inner()
+            .instance_data_binding()
+    } else if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        gpu_batched_instance_buffers
+            .into_inner()
+            .instance_data_binding()
+    } else {
         return;
     };
+    let Some(model) = model else { return };
+
     groups.model_only = Some(layouts.model_only(&render_device, &model));
 
     let skin = skins_uniform.buffer.buffer();
@@ -1140,7 +1672,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
         let entity = &item.entity();
 
-        let Some(mesh) = mesh_instances.get(entity) else {
+        let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(*entity) else {
             return RenderCommandResult::Success;
         };
         let skin_index = skin_indices.get(entity);
@@ -1154,8 +1686,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
             .get(entity)
             .map(|render_lightmap| render_lightmap.image);
 
-        let Some(bind_group) =
-            bind_groups.get(mesh.mesh_asset_id, lightmap, is_skinned, is_morphed)
+        let Some(bind_group) = bind_groups.get(mesh_asset_id, lightmap, is_skinned, is_morphed)
         else {
             error!(
                 "The MeshBindGroups resource wasn't set in the render phase. \
@@ -1187,24 +1718,50 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P>
for SetMeshBindGroup<I> {
 
 pub struct DrawMesh;
 impl<P: PhaseItem> RenderCommand<P>
for DrawMesh {
-    type Param = (SRes<RenderAssets<GpuMesh>>, SRes<RenderMeshInstances>);
-    type ViewQuery = ();
+    type Param = (
+        SRes<RenderAssets<GpuMesh>>,
+        SRes<RenderMeshInstances>,
+        SRes<PipelineCache>,
+        Option<SRes<PreprocessPipeline>>,
+    );
+    type ViewQuery = Has<PreprocessBindGroup>;
     type ItemQuery = ();
     #[inline]
     fn render<'w>(
         item: &P,
-        _view: (),
+        has_preprocess_bind_group: ROQueryItem<'w, Self::ViewQuery>,
         _item_query: Option<()>,
-        (meshes, mesh_instances): SystemParamItem<'w, '_, Self::Param>,
+        (meshes, mesh_instances, pipeline_cache, preprocess_pipeline): SystemParamItem<
+            'w,
+            '_,
+            Self::Param,
+        >,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
+        // If we're using GPU preprocessing, then we're dependent on that
+        // compute shader having been run, which of course can only happen if
+        // it's compiled. Otherwise, our mesh instance data won't be present.
+        if let Some(preprocess_pipeline) = preprocess_pipeline {
+            if !has_preprocess_bind_group
+                || !preprocess_pipeline
+                    .pipeline_id
+                    .is_some_and(|preprocess_pipeline_id| {
+                        pipeline_cache
+                            .get_compute_pipeline(preprocess_pipeline_id)
+                            .is_some()
+                    })
+            {
+                return RenderCommandResult::Failure;
+            }
+        }
+
         let meshes = meshes.into_inner();
         let mesh_instances = mesh_instances.into_inner();
 
-        let Some(mesh_instance) = mesh_instances.get(&item.entity()) else {
+        let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else {
             return RenderCommandResult::Failure;
         };
-        let Some(gpu_mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
+        let Some(gpu_mesh) = meshes.get(mesh_asset_id) else {
             return RenderCommandResult::Failure;
         };
diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
new file mode 100644
index 0000000000000..c4adaa5105623
--- /dev/null
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -0,0 +1,86 @@
+// GPU mesh uniform building.
+//
+// This is a compute shader that expands each `MeshInputUniform` out to a full
+// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform`
+// and `MeshUniform` are in a 1:N relationship.) It runs in parallel for all
+// meshes for all views. As part of this process, the shader gathers each
+// mesh's transform on the previous frame and writes it into the `MeshUniform`
+// so that TAA works.
+
+#import bevy_pbr::mesh_types::Mesh
+#import bevy_render::maths
+
+// Per-frame data that the CPU supplies to the GPU.
+struct MeshInput {
+    // The model transform.
+    model: mat3x4<f32>,
+    // The lightmap UV rect, packed into 64 bits.
+    lightmap_uv_rect: vec2<u32>,
+    // Various flags.
+    flags: u32,
+    // The index of this mesh's `MeshInput` in the `previous_input` array, if
+    // applicable. If not present, this is `u32::MAX`.
+    previous_input_index: u32,
+}
+
+// One invocation of this compute shader: i.e. one mesh instance in a view.
+struct PreprocessWorkItem {
+    // The index of the `MeshInput` in the `current_input` buffer that we read
+    // from.
+    input_index: u32,
+    // The index of the `Mesh` in `output` that we write to.
+    output_index: u32,
+}
+
+// The current frame's `MeshInput`.
+@group(0) @binding(0) var<storage> current_input: array<MeshInput>;
+// The `MeshInput` values from the previous frame.
+@group(0) @binding(1) var<storage> previous_input: array<MeshInput>;
+// Indices into the `MeshInput` buffer.
+//
+// There may be many indices that map to the same `MeshInput`.
+@group(0) @binding(2) var<storage> work_items: array<PreprocessWorkItem>;
+// The output array of `Mesh`es.
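// This is the same buffer that `prepare_mesh_bind_group` later binds as the
// per-instance mesh data for the render phases; this shader is its only
// writer.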
+@group(0) @binding(3) var<storage, read_write> output: array<Mesh>;
+
+@compute
+@workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
+    let instance_index = global_invocation_id.x;
+    if (instance_index >= arrayLength(&work_items)) {
+        return;
+    }
+
+    // Unpack.
+    let mesh_index = work_items[instance_index].input_index;
+    let output_index = work_items[instance_index].output_index;
+    let model_affine_transpose = current_input[mesh_index].model;
+    let model = maths::affine3_to_square(model_affine_transpose);
+
+    // Calculate inverse transpose.
+    let inverse_transpose_model = transpose(maths::inverse_affine3(transpose(
+        model_affine_transpose)));
+
+    // Pack inverse transpose.
+    let inverse_transpose_model_a = mat2x4<f32>(
+        vec4<f32>(inverse_transpose_model[0].xyz, inverse_transpose_model[1].x),
+        vec4<f32>(inverse_transpose_model[1].yz, inverse_transpose_model[2].xy));
+    let inverse_transpose_model_b = inverse_transpose_model[2].z;
+
+    // Look up the previous model matrix.
+    let previous_input_index = current_input[mesh_index].previous_input_index;
+    var previous_model: mat3x4<f32>;
+    if (previous_input_index == 0xffffffff) {
+        previous_model = model_affine_transpose;
+    } else {
+        previous_model = previous_input[previous_input_index].model;
+    }
+
+    // Write the output.
+    output[output_index].model = model_affine_transpose;
+    output[output_index].previous_model = previous_model;
+    output[output_index].inverse_transpose_model_a = inverse_transpose_model_a;
+    output[output_index].inverse_transpose_model_b = inverse_transpose_model_b;
+    output[output_index].flags = current_input[mesh_index].flags;
+    output[output_index].lightmap_uv_rect = current_input[mesh_index].lightmap_uv_rect;
+}
diff --git a/crates/bevy_pbr/src/render/mod.rs b/crates/bevy_pbr/src/render/mod.rs
index 7efffc05681f8..53bc9bcde14b4 100644
--- a/crates/bevy_pbr/src/render/mod.rs
+++ b/crates/bevy_pbr/src/render/mod.rs
@@ -1,4 +1,5 @@
 mod fog;
+mod gpu_preprocess;
 mod light;
 pub(crate) mod mesh;
 mod mesh_bindings;
@@ -7,6 +8,7 @@ mod morph;
 mod skin;
 
 pub use fog::*;
+pub use gpu_preprocess::*;
 pub use light::*;
 pub use mesh::*;
 pub use mesh_bindings::MeshLayouts;
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs
new file mode 100644
index 0000000000000..8757943419eb6
--- /dev/null
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@@ -0,0 +1,305 @@
+//! Batching functionality when GPU preprocessing is in use.
+
+use std::marker::PhantomData;
+
+use bevy_ecs::{
+    entity::Entity,
+    query::With,
+    system::{Query, Res, ResMut, Resource, StaticSystemParam},
+};
+use bevy_encase_derive::ShaderType;
+use bevy_utils::EntityHashMap;
+use bytemuck::{Pod, Zeroable};
+use smallvec::smallvec;
+use wgpu::{BindingResource, BufferUsages};
+
+use crate::{
+    render_phase::{
+        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
+        SortedPhaseItem, SortedRenderPhase,
+    },
+    render_resource::{BufferVec, GpuArrayBufferIndex, GpuArrayBufferable, UninitBufferVec},
+    renderer::{RenderDevice, RenderQueue},
+    view::ViewTarget,
+};
+
+use super::GetFullBatchData;
+
+/// The GPU buffers holding the data needed to render batches.
+///
+/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
+/// `BD` type parameter in that mode.
+///
+/// We have a separate *buffer data input* type (`BDI`) here, which a compute
+/// shader is expected to expand to the full buffer data (`BD`) type.
GPU +/// uniform building is generally faster and uses less system RAM to VRAM bus +/// bandwidth, but only implemented for some pipelines (for example, not in the +/// 2D pipeline at present) and only when compute shader is available. +#[derive(Resource)] +pub struct BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// A storage area for the buffer data that the GPU compute shader is + /// expected to write to. + /// + /// There will be one entry for each index. + pub data_buffer: UninitBufferVec, + + /// The index of the buffer data in the current input buffer that + /// corresponds to each instance. + /// + /// This is keyed off each view. Each view has a separate buffer. + pub work_item_buffers: EntityHashMap>, + + /// The uniform data inputs for the current frame. + /// + /// These are uploaded during the extraction phase. + pub current_input_buffer: BufferVec, + + /// The uniform data inputs for the previous frame. + /// + /// The indices don't generally line up between `current_input_buffer` + /// and `previous_input_buffer`, because, among other reasons, entities + /// can spawn or despawn between frames. Instead, each current buffer + /// data input uniform is expected to contain the index of the + /// corresponding buffer data input uniform in this list. + pub previous_input_buffer: BufferVec, +} + +/// One invocation of the preprocessing shader: i.e. one mesh instance in a +/// view. +#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[repr(C)] +pub struct PreprocessWorkItem { + /// The index of the batch input data in the input buffer that the shader + /// reads from. + pub input_index: u32, + /// The index of the `MeshUniform` in the output buffer that we write to. + pub output_index: u32, +} + +impl BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + /// Creates new buffers. + pub fn new() -> Self { + BatchedInstanceBuffers { + data_buffer: UninitBufferVec::new(BufferUsages::STORAGE), + work_item_buffers: EntityHashMap::default(), + current_input_buffer: BufferVec::new(BufferUsages::STORAGE), + previous_input_buffer: BufferVec::new(BufferUsages::STORAGE), + } + } + + /// Returns the binding of the buffer that contains the per-instance data. + /// + /// This buffer needs to be filled in via a compute shader. + pub fn instance_data_binding(&self) -> Option { + self.data_buffer + .buffer() + .map(|buffer| buffer.as_entire_binding()) + } + + /// Clears out the buffers in preparation for a new frame. + pub fn clear(&mut self) { + self.data_buffer.clear(); + self.current_input_buffer.clear(); + self.previous_input_buffer.clear(); + for work_item_buffer in self.work_item_buffers.values_mut() { + work_item_buffer.clear(); + } + } +} + +impl Default for BatchedInstanceBuffers +where + BD: GpuArrayBufferable + Sync + Send + 'static, + BDI: Pod, +{ + fn default() -> Self { + Self::new() + } +} + +/// A system that removes GPU preprocessing work item buffers that correspond to +/// deleted [`ViewTarget`]s. +/// +/// This is a separate system from [`super::clear_batched_instance_buffers`] +/// because [`ViewTarget`]s aren't created until after the extraction phase is +/// completed. 
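// The work item buffers are created lazily, one per view, by the two
// `batch_and_prepare_*` systems below; without this `retain` pass they would
// accumulate indefinitely as cameras spawn and despawn.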
+pub fn delete_old_work_item_buffers( + mut gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + view_targets: Query>, +) where + GFBD: GetFullBatchData, +{ + gpu_batched_instance_buffers + .work_item_buffers + .retain(|entity, _| view_targets.contains(*entity)); +} + +/// Batch the items in a sorted render phase, when GPU instance buffer building +/// is in use. This means comparing metadata needed to draw each phase item and +/// trying to combine the draws into a batch. +pub fn batch_and_prepare_sorted_render_phase( + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + mut views: Query<(Entity, &mut SortedRenderPhase)>, + param: StaticSystemParam, +) where + I: CachedRenderPipelinePhaseItem + SortedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + // We only process GPU-built batch data in this function. + let BatchedInstanceBuffers { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + // Create the work item buffer if necessary. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + super::batch_and_prepare_sorted_render_phase::(&mut phase, |item| { + let (input_index, compare_data) = + GFBD::get_index_and_compare_data(&system_param_item, item.entity())?; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + + *item.batch_range_mut() = output_index..output_index + 1; + + compare_data + }); + } +} + +/// Creates batches for a render phase that uses bins. +pub fn batch_and_prepare_binned_render_phase( + gpu_batched_instance_buffers: ResMut< + BatchedInstanceBuffers, + >, + mut views: Query<(Entity, &mut BinnedRenderPhase)>, + param: StaticSystemParam, +) where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + let system_param_item = param.into_inner(); + + let BatchedInstanceBuffers { + ref mut data_buffer, + ref mut work_item_buffers, + .. + } = gpu_batched_instance_buffers.into_inner(); + + for (view, mut phase) in &mut views { + let phase = &mut *phase; // Borrow checker. + + // Create the work item buffer if necessary; otherwise, just mark it as + // used this frame. + let work_item_buffer = work_item_buffers + .entry(view) + .or_insert_with(|| BufferVec::new(BufferUsages::STORAGE)); + + // Prepare batchables. + + for key in &phase.batchable_keys { + let mut batch: Option = None; + for &entity in &phase.batchable_values[key] { + let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else { + continue; + }; + let output_index = data_buffer.add() as u32; + + work_item_buffer.push(PreprocessWorkItem { + input_index: input_index.into(), + output_index, + }); + + batch + .get_or_insert(BinnedRenderPhaseBatch { + representative_entity: entity, + instance_range: output_index..output_index, + dynamic_offset: None, + }) + .instance_range + .end = output_index + 1; + } + + if let Some(batch) = batch { + phase.batch_sets.push(smallvec![batch]); + } + } + + // Prepare unbatchables. 
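        // Unbatchable entities still get a `MeshUniform` slot and a work item
        // apiece; they are simply recorded as single-instance draws through
        // `GpuArrayBufferIndex` instead of being folded into a batch range.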
+        for key in &phase.unbatchable_keys {
+            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
+            for &entity in &unbatchables.entities {
+                let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
+                    continue;
+                };
+                let output_index = data_buffer.add() as u32;
+
+                work_item_buffer.push(PreprocessWorkItem {
+                    input_index: input_index.into(),
+                    output_index,
+                });
+
+                unbatchables
+                    .buffer_indices
+                    .add(GpuArrayBufferIndex::<GFBD::BufferData> {
+                        index: output_index,
+                        dynamic_offset: None,
+                        element_type: PhantomData,
+                    });
+            }
+        }
+    }
+}
+
+/// A system that writes all instance buffers to the GPU.
+pub fn write_batched_instance_buffers<GFBD>(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut gpu_batched_instance_buffers: ResMut<
+        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+    >,
+) where
+    GFBD: GetFullBatchData,
+{
+    gpu_batched_instance_buffers
+        .data_buffer
+        .write_buffer(&render_device);
+    gpu_batched_instance_buffers
+        .current_input_buffer
+        .write_buffer(&render_device, &render_queue);
+    // There's no need to write `previous_input_buffer`, as we wrote
+    // that on the previous frame, and it hasn't changed.
+
+    for work_item_buffer in gpu_batched_instance_buffers.work_item_buffers.values_mut() {
+        work_item_buffer.write_buffer(&render_device, &render_queue);
+    }
+}
+
+/// Determines whether it's possible to run preprocessing on the GPU.
+///
+/// Currently, this simply checks to see whether compute shaders are supported.
+pub fn can_preprocess_on_gpu(render_device: &RenderDevice) -> bool {
+    render_device.limits().max_compute_workgroup_size_x > 0
+}
diff --git a/crates/bevy_render/src/batching/mod.rs b/crates/bevy_render/src/batching/mod.rs
index 576bf641fb71f..6811451f37dbd 100644
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@@ -1,21 +1,22 @@
 use bevy_ecs::{
     component::Component,
     entity::Entity,
-    prelude::Res,
-    system::{Query, ResMut, StaticSystemParam, SystemParam, SystemParamItem},
+    system::{Query, ResMut, SystemParam, SystemParamItem},
 };
+use bytemuck::Pod;
 use nonmax::NonMaxU32;
-use smallvec::{smallvec, SmallVec};
 
 use crate::{
     render_phase::{
-        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        DrawFunctionId, SortedPhaseItem, SortedRenderPhase,
+        BinnedPhaseItem, BinnedRenderPhase, CachedRenderPipelinePhaseItem, DrawFunctionId,
+        SortedPhaseItem, SortedRenderPhase,
     },
-    render_resource::{CachedRenderPipelineId, GpuArrayBuffer, GpuArrayBufferable},
-    renderer::{RenderDevice, RenderQueue},
+    render_resource::{CachedRenderPipelineId, GpuArrayBufferable},
 };
+pub mod gpu_preprocessing;
+pub mod no_gpu_preprocessing;
+
 /// Add this component to mesh entities to disable automatic batching
 #[derive(Component)]
 pub struct NoAutomaticBatching;
@@ -59,6 +60,10 @@ impl<T: PartialEq> BatchMeta<T> {
 /// A trait to support getting data used for batching draw commands via phase
 /// items.
+///
+/// This is a simple version that only allows for sorting, not binning, as well
+/// as only CPU processing, not GPU preprocessing. For these fancier features,
+/// see [`GetFullBatchData`].
 pub trait GetBatchData {
     /// The system parameters [`GetBatchData::get_batch_data`] needs in
     /// order to compute the batch data.
@@ -67,79 +72,90 @@ pub trait GetBatchData {
     /// function id, per-instance data buffer dynamic offset and this data
     /// matches, the draws can be batched.
     type CompareData: PartialEq;
-    /// The per-instance data to be inserted into the [`GpuArrayBuffer`]
-    /// containing these data for all instances.
+    /// The per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`] containing these data for all
+    /// instances.
     type BufferData: GpuArrayBufferable + Sync + Send + 'static;
-    /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`].
-    /// If the instance can be batched, also return the data used for
-    /// comparison when deciding whether draws can be batched, else return None
-    /// for the `CompareData`.
+    /// Get the per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`]. If the instance can be
+    /// batched, also return the data used for comparison when deciding whether
+    /// draws can be batched, else return None for the `CompareData`.
+    ///
+    /// This is only called when building instance data on CPU. In the GPU
+    /// instance data building path, we use
+    /// [`GetFullBatchData::get_index_and_compare_data`] instead.
     fn get_batch_data(
         param: &SystemParamItem<Self::Param>,
         query_item: Entity,
     ) -> Option<(Self::BufferData, Option<Self::CompareData>)>;
 }
 
-/// When implemented on a pipeline, this trait allows the batching logic to
-/// compute the per-batch data that will be uploaded to the GPU.
+/// A trait to support getting data used for batching draw commands via phase
+/// items.
 ///
-/// This includes things like the mesh transforms.
-pub trait GetBinnedBatchData {
-    /// The system parameters [`GetBinnedBatchData::get_batch_data`] needs
-    /// in order to compute the batch data.
-    type Param: SystemParam + 'static;
-    /// The per-instance data to be inserted into the [`GpuArrayBuffer`]
-    /// containing these data for all instances.
-    type BufferData: GpuArrayBufferable + Sync + Send + 'static;
-
-    /// Get the per-instance data to be inserted into the [`GpuArrayBuffer`].
-    fn get_batch_data(
+/// This version allows for binning and GPU preprocessing.
+pub trait GetFullBatchData: GetBatchData {
+    /// The per-instance data that was inserted into the
+    /// [`crate::render_resource::BufferVec`] during extraction.
+    type BufferInputData: Pod + Sync + Send;
+
+    /// Get the per-instance data to be inserted into the
+    /// [`crate::render_resource::GpuArrayBuffer`].
+    ///
+    /// This is only called when building uniforms on CPU. In the GPU instance
+    /// buffer building path, we use [`GetFullBatchData::get_binned_index`]
+    /// instead.
+    fn get_binned_batch_data(
         param: &SystemParamItem<Self::Param>,
-        entity: Entity,
+        query_item: Entity,
     ) -> Option<Self::BufferData>;
+
+    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
+    /// GPU preprocessing phase will use.
+    ///
+    /// We already inserted the [`GetFullBatchData::BufferInputData`] during the
+    /// extraction phase before we got here, so this function shouldn't need to
+    /// look up any render data. If CPU instance buffer building is in use, this
+    /// function will never be called.
+    fn get_index_and_compare_data(
+        param: &SystemParamItem<Self::Param>,
+        query_item: Entity,
+    ) -> Option<(NonMaxU32, Option<Self::CompareData>)>;
+
+    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
+    /// GPU preprocessing phase will use, for the binning path.
+    ///
+    /// We already inserted the [`GetFullBatchData::BufferInputData`] during the
+    /// extraction phase before we got here, so this function shouldn't need to
+    /// look up any render data. If CPU instance buffer building is in use, this
+    /// function will never be called.
+    fn get_binned_index(
+        param: &SystemParamItem<Self::Param>,
+        query_item: Entity,
+    ) -> Option<NonMaxU32>;
 }
 
-/// Batch the items in a sorted render phase. This means comparing metadata
-/// needed to draw each phase item and trying to combine the draws into a batch.
-pub fn batch_and_prepare_sorted_render_phase<I, F>(
-    gpu_array_buffer: ResMut<GpuArrayBuffer<F::BufferData>>,
-    mut views: Query<&mut SortedRenderPhase<I>>,
-    param: StaticSystemParam<F::Param>,
+/// A system that runs early in extraction and clears out all the
+/// [`gpu_preprocessing::BatchedInstanceBuffers`] for the frame.
+///
+/// We have to run this during extraction because, if GPU preprocessing is in
+/// use, the extraction phase will write to the mesh input uniform buffers
+/// directly, so the buffers need to be cleared before then.
+pub fn clear_batched_instance_buffers<GFBD>(
+    cpu_batched_instance_buffer: Option<
+        ResMut<no_gpu_preprocessing::BatchedInstanceBuffer<GFBD::BufferData>>,
+    >,
+    gpu_batched_instance_buffers: Option<
+        ResMut<gpu_preprocessing::BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    >,
 ) where
-    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
-    F: GetBatchData,
+    GFBD: GetFullBatchData,
 {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    let system_param_item = param.into_inner();
-
-    let mut process_item = |item: &mut I| {
-        let (buffer_data, compare_data) = F::get_batch_data(&system_param_item, item.entity())?;
-        let buffer_index = gpu_array_buffer.push(buffer_data);
-
-        let index = buffer_index.index;
-        *item.batch_range_mut() = index..index + 1;
-        *item.dynamic_offset_mut() = buffer_index.dynamic_offset;
-
-        if I::AUTOMATIC_BATCHING {
-            compare_data.map(|compare_data| BatchMeta::new(item, compare_data))
-        } else {
-            None
-        }
-    };
-
-    for mut phase in &mut views {
-        let items = phase.items.iter_mut().map(|item| {
-            let batch_data = process_item(item);
-            (item.batch_range_mut(), batch_data)
-        });
-        items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| {
-            if batch_meta.is_some() && prev_batch_meta == batch_meta {
-                start_range.end = range.end;
-                (start_range, prev_batch_meta)
-            } else {
-                (range, batch_meta)
-            }
-        });
+    if let Some(mut cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+        cpu_batched_instance_buffer.clear();
+    }
+    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
+        gpu_batched_instance_buffers.clear();
     }
 }
 
@@ -154,75 +170,35 @@ where
     }
 }
 
-/// Creates batches for a render phase that uses bins.
-pub fn batch_and_prepare_binned_render_phase<BPI, GBBD>(
-    gpu_array_buffer: ResMut<GpuArrayBuffer<GBBD::BufferData>>,
-    mut views: Query<&mut BinnedRenderPhase<BPI>>,
-    param: StaticSystemParam<GBBD::Param>,
+/// Batches the items in a sorted render phase.
+///
+/// This means comparing metadata needed to draw each phase item and trying to
+/// combine the draws into a batch.
+///
+/// This is common code factored out from
+/// [`gpu_preprocessing::batch_and_prepare_sorted_render_phase`] and
+/// [`no_gpu_preprocessing::batch_and_prepare_sorted_render_phase`].
+fn batch_and_prepare_sorted_render_phase<I, GBD>(
+    phase: &mut SortedRenderPhase<I>,
+    mut process_item: impl FnMut(&mut I) -> Option<GBD::CompareData>,
 ) where
-    BPI: BinnedPhaseItem,
-    GBBD: GetBinnedBatchData,
+    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    GBD: GetBatchData,
 {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    let system_param_item = param.into_inner();
-
-    for mut phase in &mut views {
-        let phase = &mut *phase; // Borrow checker.
-
-        // Prepare batchables.
-
-        for key in &phase.batchable_keys {
-            let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
-            for &entity in &phase.batchable_values[key] {
-                let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) else {
-                    continue;
-                };
-
-                let instance = gpu_array_buffer.push(buffer_data);
-
-                // If the dynamic offset has changed, flush the batch.
-                //
-                // This is the only time we ever have more than one batch per
-                // bin. Note that dynamic offsets are only used on platforms
-                // with no storage buffers.
-                if !batch_set.last().is_some_and(|batch| {
-                    batch.instance_range.end == instance.index
-                        && batch.dynamic_offset == instance.dynamic_offset
-                }) {
-                    batch_set.push(BinnedRenderPhaseBatch {
-                        representative_entity: entity,
-                        instance_range: instance.index..instance.index,
-                        dynamic_offset: instance.dynamic_offset,
-                    });
-                }
-
-                if let Some(batch) = batch_set.last_mut() {
-                    batch.instance_range.end = instance.index + 1;
-                }
-            }
-
-            phase.batch_sets.push(batch_set);
-        }
-
-        // Prepare unbatchables.
-        for key in &phase.unbatchable_keys {
-            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
-            for &entity in &unbatchables.entities {
-                if let Some(buffer_data) = GBBD::get_batch_data(&system_param_item, entity) {
-                    let instance = gpu_array_buffer.push(buffer_data);
-                    unbatchables.buffer_indices.add(instance);
-                }
-            }
+    let items = phase.items.iter_mut().map(|item| {
+        let batch_data = match process_item(item) {
+            Some(compare_data) if I::AUTOMATIC_BATCHING => Some(BatchMeta::new(item, compare_data)),
+            _ => None,
+        };
+        (item.batch_range_mut(), batch_data)
+    });
+
+    items.reduce(|(start_range, prev_batch_meta), (range, batch_meta)| {
+        if batch_meta.is_some() && prev_batch_meta == batch_meta {
+            start_range.end = range.end;
+            (start_range, prev_batch_meta)
+        } else {
+            (range, batch_meta)
         }
-    }
-}
-
-pub fn write_batched_instance_buffer<F: GetBatchData>(
-    render_device: Res<RenderDevice>,
-    render_queue: Res<RenderQueue>,
-    gpu_array_buffer: ResMut<GpuArrayBuffer<F::BufferData>>,
-) {
-    let gpu_array_buffer = gpu_array_buffer.into_inner();
-    gpu_array_buffer.write_buffer(&render_device, &render_queue);
-    gpu_array_buffer.clear();
+    });
 }
diff --git a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
new file mode 100644
index 0000000000000..429fe5bb4542e
--- /dev/null
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@@ -0,0 +1,151 @@
+//! Batching functionality when GPU preprocessing isn't in use.
+
+use bevy_derive::{Deref, DerefMut};
+use bevy_ecs::system::{Query, Res, ResMut, Resource, StaticSystemParam};
+use smallvec::{smallvec, SmallVec};
+use wgpu::BindingResource;
+
+use crate::{
+    render_phase::{
+        BinnedPhaseItem, BinnedRenderPhase, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
+        SortedPhaseItem, SortedRenderPhase,
+    },
+    render_resource::{GpuArrayBuffer, GpuArrayBufferable},
+    renderer::{RenderDevice, RenderQueue},
+};
+
+use super::{GetBatchData, GetFullBatchData};
+
+/// The GPU buffers holding the data needed to render batches.
+///
+/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
+/// `BD` type parameter in that mode.
+#[derive(Resource, Deref, DerefMut)]
+pub struct BatchedInstanceBuffer<BD>(pub GpuArrayBuffer<BD>)
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static;
+
+impl<BD> BatchedInstanceBuffer<BD>
+where
+    BD: GpuArrayBufferable + Sync + Send + 'static,
+{
+    /// Creates a new buffer.
+    pub fn new(render_device: &RenderDevice) -> Self {
+        BatchedInstanceBuffer(GpuArrayBuffer::new(render_device))
+    }
+
+    /// Returns the binding of the buffer that contains the per-instance data.
+    ///
+    /// Unlike in the GPU instance buffer building mode, this buffer is fully
+    /// populated on the CPU, so no compute shader pass is needed before it can
+    /// be bound.
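+    ///
+    /// A sketch of typical usage, mirroring how the 2D mesh pipeline prepares
+    /// its bind group later in this patch (the resource and layout names here
+    /// are illustrative):
+    ///
+    /// ```ignore
+    /// if let Some(binding) = instance_buffer.instance_data_binding() {
+    ///     let bind_group = render_device.create_bind_group(
+    ///         "mesh_bind_group",
+    ///         &pipeline.mesh_layout,
+    ///         &BindGroupEntries::single(binding),
+    ///     );
+    /// }
+    /// ```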
+    pub fn instance_data_binding(&self) -> Option<BindingResource> {
+        self.binding()
+    }
+}
+
+/// Batch the items in a sorted render phase, when GPU instance buffer building
+/// isn't in use. This means comparing metadata needed to draw each phase item
+/// and trying to combine the draws into a batch.
+pub fn batch_and_prepare_sorted_render_phase<I, GBD>(
+    batched_instance_buffer: ResMut<BatchedInstanceBuffer<GBD::BufferData>>,
+    mut views: Query<&mut SortedRenderPhase<I>>,
+    param: StaticSystemParam<GBD::Param>,
+) where
+    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
+    GBD: GetBatchData,
+{
+    let system_param_item = param.into_inner();
+
+    // We only process CPU-built batch data in this function.
+    let batched_instance_buffer = batched_instance_buffer.into_inner();
+
+    for mut phase in &mut views {
+        super::batch_and_prepare_sorted_render_phase::<I, GBD>(&mut phase, |item| {
+            let (buffer_data, compare_data) =
+                GBD::get_batch_data(&system_param_item, item.entity())?;
+            let buffer_index = batched_instance_buffer.push(buffer_data);
+
+            let index = buffer_index.index;
+            *item.batch_range_mut() = index..index + 1;
+            *item.dynamic_offset_mut() = buffer_index.dynamic_offset;
+
+            compare_data
+        });
+    }
+}
+
+/// Creates batches for a render phase that uses bins, when GPU batch data
+/// building isn't in use.
+pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
+    mut buffer: ResMut<BatchedInstanceBuffer<GFBD::BufferData>>,
+    mut views: Query<&mut BinnedRenderPhase<BPI>>,
+    param: StaticSystemParam<GFBD::Param>,
+) where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    let system_param_item = param.into_inner();
+
+    for mut phase in &mut views {
+        let phase = &mut *phase; // Borrow checker.
+
+        // Prepare batchables.
+
+        for key in &phase.batchable_keys {
+            let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
+            for &entity in &phase.batchable_values[key] {
+                let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity)
+                else {
+                    continue;
+                };
+                let instance = buffer.push(buffer_data);
+
+                // If the dynamic offset has changed, flush the batch.
+                //
+                // This is the only time we ever have more than one batch per
+                // bin. Note that dynamic offsets are only used on platforms
+                // with no storage buffers.
+                if !batch_set.last().is_some_and(|batch| {
+                    batch.instance_range.end == instance.index
+                        && batch.dynamic_offset == instance.dynamic_offset
+                }) {
+                    batch_set.push(BinnedRenderPhaseBatch {
+                        representative_entity: entity,
+                        instance_range: instance.index..instance.index,
+                        dynamic_offset: instance.dynamic_offset,
+                    });
+                }
+
+                if let Some(batch) = batch_set.last_mut() {
+                    batch.instance_range.end = instance.index + 1;
+                }
+            }
+
+            phase.batch_sets.push(batch_set);
+        }
+
+        // Prepare unbatchables.
+        for key in &phase.unbatchable_keys {
+            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
+            for &entity in &unbatchables.entities {
+                let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity)
+                else {
+                    continue;
+                };
+                let instance = buffer.push(buffer_data);
+                unbatchables.buffer_indices.add(instance);
+            }
+        }
+    }
+}
+
+/// Writes the instance buffer data to the GPU.
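+///
+/// A sketch of how this system is typically scheduled, once the batching
+/// systems above have pushed all the instance data for the frame (the pipeline
+/// type here is an illustrative assumption):
+///
+/// ```ignore
+/// render_app.add_systems(
+///     Render,
+///     write_batched_instance_buffer::<Mesh2dPipeline>
+///         .in_set(RenderSet::PrepareResourcesFlush),
+/// );
+/// ```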
+pub fn write_batched_instance_buffer<GBD>(
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+    mut cpu_batched_instance_buffer: ResMut<BatchedInstanceBuffer<GBD::BufferData>>,
+) where
+    GBD: GetBatchData,
+{
+    cpu_batched_instance_buffer.write_buffer(&render_device, &render_queue);
+}
diff --git a/crates/bevy_render/src/maths.wgsl b/crates/bevy_render/src/maths.wgsl
index 17d045154a317..53c254f3746f8 100644
--- a/crates/bevy_render/src/maths.wgsl
+++ b/crates/bevy_render/src/maths.wgsl
@@ -27,3 +27,27 @@ fn mat2x4_f32_to_mat3x3_unpack(
         vec3<f32>(a[1].zw, b),
     );
 }
+
+// Extracts the square portion of an affine matrix: i.e. discards the
+// translation.
+fn affine3_to_mat3x3(affine: mat4x3<f32>) -> mat3x3<f32> {
+    return mat3x3<f32>(affine[0].xyz, affine[1].xyz, affine[2].xyz);
+}
+
+// Returns the inverse of a 3x3 matrix, computed from the cross products of
+// its columns (the transposed adjugate divided by the determinant).
+fn inverse_mat3x3(matrix: mat3x3<f32>) -> mat3x3<f32> {
+    let tmp0 = cross(matrix[1], matrix[2]);
+    let tmp1 = cross(matrix[2], matrix[0]);
+    let tmp2 = cross(matrix[0], matrix[1]);
+    let inv_det = 1.0 / dot(matrix[2], tmp2);
+    return transpose(mat3x3<f32>(tmp0 * inv_det, tmp1 * inv_det, tmp2 * inv_det));
+}
+
+// Returns the inverse of an affine matrix [M|t]: the inverse of its 3x3 part
+// M, with translation -(M^-1) * t.
+//
+// https://en.wikipedia.org/wiki/Affine_transformation#Groups
+fn inverse_affine3(affine: mat4x3<f32>) -> mat4x3<f32> {
+    let matrix3 = affine3_to_mat3x3(affine);
+    let inv_matrix3 = inverse_mat3x3(matrix3);
+    return mat4x3<f32>(inv_matrix3[0], inv_matrix3[1], inv_matrix3[2], -(inv_matrix3 * affine[3]));
+}
diff --git a/crates/bevy_render/src/render_phase/mod.rs b/crates/bevy_render/src/render_phase/mod.rs
index 40c4153f3fde2..58e1fa550f4e2 100644
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@@ -10,11 +10,10 @@
 //!
 //! To draw an entity, a corresponding [`PhaseItem`] has to be added to one or multiple of these
 //! render phases for each view that it is visible in.
-//! This must be done in the [`RenderSet::Queue`](crate::RenderSet::Queue).
-//! After that the render phase sorts them in the
-//! [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort).
-//! Finally the items are rendered using a single [`TrackedRenderPass`], during the
-//! [`RenderSet::Render`](crate::RenderSet::Render).
+//! This must be done in the [`RenderSet::Queue`].
+//! After that the render phase sorts them in the [`RenderSet::PhaseSort`].
+//! Finally the items are rendered using a single [`TrackedRenderPass`], during
+//! the [`RenderSet::Render`].
 //!
 //! Therefore each phase item is assigned a [`Draw`] function.
 //! These set up the state of the [`TrackedRenderPass`] (i.e. select the
@@ -29,6 +28,7 @@ mod draw;
 mod draw_state;
 mod rangefinder;
 
+use bevy_app::{App, Plugin};
 use bevy_utils::{default, hashbrown::hash_map::Entry, HashMap};
 pub use draw::*;
 pub use draw_state::*;
@@ -36,13 +36,22 @@ use encase::{internal::WriteInto, ShaderSize};
 use nonmax::NonMaxU32;
 pub use rangefinder::*;
 
-use crate::render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache};
+use crate::{
+    batching::{
+        self,
+        gpu_preprocessing::{self, BatchedInstanceBuffers},
+        no_gpu_preprocessing::{self, BatchedInstanceBuffer},
+        GetFullBatchData,
+    },
+    render_resource::{CachedRenderPipelineId, GpuArrayBufferIndex, PipelineCache},
+    Render, RenderApp, RenderSet,
+};
 use bevy_ecs::{
     prelude::*,
     system::{lifetimeless::SRes, SystemParamItem},
 };
 use smallvec::SmallVec;
-use std::{hash::Hash, ops::Range, slice::SliceIndex};
+use std::{hash::Hash, marker::PhantomData, ops::Range, slice::SliceIndex};
 
 /// A collection of all rendering instructions, that will be executed by the GPU, for a
 /// single render phase for a single view.
@@ -291,6 +300,101 @@ where
     }
 }
 
+/// A convenient abstraction for adding all the systems necessary for a binned
+/// render phase to the render app.
+///
+/// This is the version used when the pipeline supports GPU preprocessing: e.g.
+/// 3D PBR meshes.
+pub struct BinnedRenderPhasePlugin<BPI, GFBD>(PhantomData<(BPI, GFBD)>)
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData;
+
+impl<BPI, GFBD> Default for BinnedRenderPhasePlugin<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    fn default() -> Self {
+        Self(PhantomData)
+    }
+}
+
+impl<BPI, GFBD> Plugin for BinnedRenderPhasePlugin<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData + Sync + Send + 'static,
+{
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.add_systems(
+            Render,
+            (
+                batching::sort_binned_render_phase::<BPI>.in_set(RenderSet::PhaseSort),
+                (
+                    no_gpu_preprocessing::batch_and_prepare_binned_render_phase::<BPI, GFBD>
+                        .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                    gpu_preprocessing::batch_and_prepare_binned_render_phase::<BPI, GFBD>.run_if(
+                        resource_exists::<
+                            BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                        >,
+                    ),
+                )
+                    .in_set(RenderSet::PrepareResources),
+            ),
+        );
+    }
+}
+
+/// A convenient abstraction for adding all the systems necessary for a sorted
+/// render phase to the render app.
+///
+/// This is the version used when the pipeline supports GPU preprocessing: e.g.
+/// 3D PBR meshes.
+pub struct SortedRenderPhasePlugin<SPI, GFBD>(PhantomData<(SPI, GFBD)>)
+where
+    SPI: SortedPhaseItem,
+    GFBD: GetFullBatchData;
+
+impl<SPI, GFBD> Default for SortedRenderPhasePlugin<SPI, GFBD>
+where
+    SPI: SortedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    fn default() -> Self {
+        Self(PhantomData)
+    }
+}
+
+impl<SPI, GFBD> Plugin for SortedRenderPhasePlugin<SPI, GFBD>
+where
+    SPI: SortedPhaseItem + CachedRenderPipelinePhaseItem,
+    GFBD: GetFullBatchData + Sync + Send + 'static,
+{
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app.add_systems(
+            Render,
+            (
+                no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
+                    .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>.run_if(
+                    resource_exists::<
+                        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                    >,
+                ),
+            )
+                .in_set(RenderSet::PrepareResources),
+        );
+    }
+}
+
 impl UnbatchableBinnedEntityBufferIndex {
     /// Adds a new entity to the list of unbatchable binned entities.
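    ///
    /// For example, the GPU preprocessing path above records the output slot it
    /// reserved for an unbatchable entity like this (mirroring the call in
    /// `gpu_preprocessing.rs`):
    ///
    /// ```ignore
    /// unbatchables.buffer_indices.add(GpuArrayBufferIndex::<GFBD::BufferData> {
    ///     index: output_index,
    ///     dynamic_offset: None,
    ///     element_type: PhantomData,
    /// });
    /// ```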
    pub fn add<T>(&mut self, gpu_array_buffer_index: GpuArrayBufferIndex<T>)
@@ -463,12 +567,10 @@ where
 ///
 /// The data required for rendering an entity is extracted from the main world in the
 /// [`ExtractSchedule`](crate::ExtractSchedule).
-/// Then it has to be queued up for rendering during the
-/// [`RenderSet::Queue`](crate::RenderSet::Queue), by adding a corresponding phase item to
-/// a render phase.
+/// Then it has to be queued up for rendering during the [`RenderSet::Queue`],
+/// by adding a corresponding phase item to a render phase.
 /// Afterwards it will be possibly sorted and rendered automatically in the
-/// [`RenderSet::PhaseSort`](crate::RenderSet::PhaseSort) and
-/// [`RenderSet::Render`](crate::RenderSet::Render), respectively.
+/// [`RenderSet::PhaseSort`] and [`RenderSet::Render`], respectively.
 ///
 /// `PhaseItem`s come in two flavors: [`BinnedPhaseItem`]s and
 /// [`SortedPhaseItem`]s.
diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs
index f330d52dea50a..8a0f77daafb3d 100644
--- a/crates/bevy_render/src/render_resource/buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/buffer_vec.rs
@@ -1,3 +1,5 @@
+use std::marker::PhantomData;
+
 use crate::{
     render_resource::Buffer,
     renderer::{RenderDevice, RenderQueue},
@@ -5,6 +7,8 @@ use crate::{
 use bytemuck::{must_cast_slice, NoUninit};
 use wgpu::BufferUsages;
 
+use super::GpuArrayBufferable;
+
 /// A structure for storing raw bytes that have already been properly formatted
 /// for use by the GPU.
 ///
@@ -160,3 +164,96 @@ impl<T: NoUninit> Extend<T> for BufferVec<T> {
         self.values.extend(iter);
     }
 }
+
+/// Like a [`BufferVec`], but only reserves space on the GPU for elements
+/// instead of initializing them CPU-side.
+///
+/// This type is useful when you're accumulating "output slots" for a GPU
+/// compute shader to write into.
+///
+/// The type `T` need not be [`NoUninit`], unlike [`BufferVec`]; it only has to
+/// be [`GpuArrayBufferable`].
+pub struct UninitBufferVec<T>
+where
+    T: GpuArrayBufferable,
+{
+    buffer: Option<Buffer>,
+    len: usize,
+    capacity: usize,
+    item_size: usize,
+    buffer_usage: BufferUsages,
+    label: Option<String>,
+    label_changed: bool,
+    phantom: PhantomData<T>,
+}
+
+impl<T> UninitBufferVec<T>
+where
+    T: GpuArrayBufferable,
+{
+    /// Creates a new [`UninitBufferVec`] with the given [`BufferUsages`].
+    pub const fn new(buffer_usage: BufferUsages) -> Self {
+        Self {
+            len: 0,
+            buffer: None,
+            capacity: 0,
+            item_size: std::mem::size_of::<T>(),
+            buffer_usage,
+            label: None,
+            label_changed: false,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Returns the buffer, if allocated.
+    #[inline]
+    pub fn buffer(&self) -> Option<&Buffer> {
+        self.buffer.as_ref()
+    }
+
+    /// Reserves space for one more element in the buffer and returns its index.
+    pub fn add(&mut self) -> usize {
+        let index = self.len;
+        self.len += 1;
+        index
+    }
+
+    /// Returns true if no elements have been added to this [`UninitBufferVec`].
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Removes all elements from the buffer.
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Materializes the buffer on the GPU with space for `capacity` elements.
+    ///
+    /// If the buffer is already big enough, this function doesn't reallocate
+    /// the buffer.
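+    ///
+    /// A sketch of the intended call pattern (illustrative; the element type is
+    /// an assumption, and in practice [`Self::write_buffer`] calls this for
+    /// you):
+    ///
+    /// ```ignore
+    /// let mut slots = UninitBufferVec::<MeshUniform>::new(BufferUsages::STORAGE);
+    /// let index = slots.add(); // reserve an output slot, CPU-side only
+    /// slots.reserve(64, &render_device); // allocate GPU space for 64 elements
+    /// ```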
+    pub fn reserve(&mut self, capacity: usize, device: &RenderDevice) {
+        if capacity <= self.capacity && !self.label_changed {
+            return;
+        }
+
+        self.capacity = capacity;
+        let size = self.item_size * capacity;
+        self.buffer = Some(device.create_buffer(&wgpu::BufferDescriptor {
+            label: self.label.as_deref(),
+            size: size as wgpu::BufferAddress,
+            usage: BufferUsages::COPY_DST | self.buffer_usage,
+            mapped_at_creation: false,
+        }));
+
+        self.label_changed = false;
+    }
+
+    /// Materializes the buffer on the GPU, with an appropriate size for the
+    /// elements that have been pushed so far.
+    pub fn write_buffer(&mut self, device: &RenderDevice) {
+        if !self.is_empty() {
+            self.reserve(self.len, device);
+        }
+    }
+}
diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs
index edf8dd4fabf4a..257ad61dfbb7f 100644
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@@ -11,12 +11,12 @@ use bevy_ecs::{
 };
 use bevy_math::{Affine3, Vec4};
 use bevy_reflect::{std_traits::ReflectDefault, Reflect};
+use bevy_render::batching::no_gpu_preprocessing::{
+    batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer,
+};
 use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef};
 use bevy_render::{
-    batching::{
-        batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, GetBatchData,
-        NoAutomaticBatching,
-    },
+    batching::{GetBatchData, NoAutomaticBatching},
     globals::{GlobalsBuffer, GlobalsUniform},
     mesh::{GpuBufferInfo, Mesh},
     render_asset::RenderAssets,
@@ -116,9 +116,13 @@ impl Plugin for Mesh2dRenderPlugin {
         let mut mesh_bindings_shader_defs = Vec::with_capacity(1);
 
         if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
-            if let Some(per_object_buffer_batch_size) = GpuArrayBuffer::<Mesh2dUniform>::batch_size(
-                render_app.world().resource::<RenderDevice>(),
-            ) {
+            let render_device = render_app.world().resource::<RenderDevice>();
+            let batched_instance_buffer =
+                BatchedInstanceBuffer::<Mesh2dUniform>::new(render_device);
+
+            if let Some(per_object_buffer_batch_size) =
+                GpuArrayBuffer::<Mesh2dUniform>::batch_size(render_device)
+            {
                 mesh_bindings_shader_defs.push(ShaderDefVal::UInt(
                     "PER_OBJECT_BUFFER_BATCH_SIZE".into(),
                     per_object_buffer_batch_size,
@@ -126,9 +130,7 @@ impl Plugin for Mesh2dRenderPlugin {
             }
 
             render_app
-                .insert_resource(GpuArrayBuffer::<Mesh2dUniform>::new(
-                    render_app.world().resource::<RenderDevice>(),
-                ))
+                .insert_resource(batched_instance_buffer)
                 .init_resource::<Mesh2dPipeline>();
         }
 
@@ -571,9 +573,9 @@ pub fn prepare_mesh2d_bind_group(
     mut commands: Commands,
     mesh2d_pipeline: Res<Mesh2dPipeline>,
     render_device: Res<RenderDevice>,
-    mesh2d_uniforms: Res<GpuArrayBuffer<Mesh2dUniform>>,
+    mesh2d_uniforms: Res<BatchedInstanceBuffer<Mesh2dUniform>>,
 ) {
-    if let Some(binding) = mesh2d_uniforms.binding() {
+    if let Some(binding) = mesh2d_uniforms.instance_data_binding() {
         commands.insert_resource(Mesh2dBindGroup {
             value: render_device.create_bind_group(
                 "mesh2d_bind_group",
diff --git a/examples/shader/shader_instancing.rs b/examples/shader/shader_instancing.rs
index b44c96e1c3029..b1c91c8973b07 100644
--- a/examples/shader/shader_instancing.rs
+++ b/examples/shader/shader_instancing.rs
@@ -127,7 +127,7 @@ fn queue_custom(
     let view_key = msaa_key | MeshPipelineKey::from_hdr(view.hdr);
     let rangefinder = view.rangefinder3d();
     for entity in &material_meshes {
-        let Some(mesh_instance) = render_mesh_instances.get(&entity) else {
+        let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(entity) else {
             continue;
         };
         let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
@@ -142,8 +142,7 @@ fn queue_custom(
             entity,
             pipeline,
             draw_function: draw_custom,
-            distance: rangefinder
-                .distance_translation(&mesh_instance.transforms.transform.translation),
+            distance: rangefinder.distance_translation(&mesh_instance.translation),
             batch_range: 0..1,
             dynamic_offset: None,
         });
@@ -246,7 +245,8 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMeshInstanced {
         (meshes, render_mesh_instances): SystemParamItem<'w, '_, Self::Param>,
         pass: &mut TrackedRenderPass<'w>,
     ) -> RenderCommandResult {
-        let Some(mesh_instance) = render_mesh_instances.get(&item.entity()) else {
+        let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(item.entity())
+        else {
             return RenderCommandResult::Failure;
         };
         let Some(gpu_mesh) = meshes.into_inner().get(mesh_instance.mesh_asset_id) else {