Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] demonstrate using NVTXW to export internal CUPTI events to nsys-rep #2450

Draft
wants to merge 2 commits into
base: branch-24.10
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/main/cpp/profiler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ configure_file(

add_executable(spark_rapids_profile_converter
spark_rapids_profile_converter.cpp
initialize_nvtxw.cpp
nvtxw3.cpp
nvtxw3.h
NvtxwEvents.cpp
NvtxwEvents.h
"${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/profiler_schema.cpp"
"${SPARK_RAPIDS_JNI_GENERATED_SOURCE_DIR}/spark_rapids_jni_version.cpp"
"${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}/profiler_generated.h"
Expand All @@ -86,6 +91,8 @@ target_include_directories(
spark_rapids_profile_converter
PRIVATE
"${CUDAToolkit_INCLUDE_DIRS}"
"${SPARK_RAPIDS_JNI_SOURCE_DIR}"
"${SPARK_RAPIDS_JNI_SOURCE_DIR}/profiler"
"${SPARK_RAPIDS_JNI_SOURCE_DIR}/src"
"${SPARK_RAPIDS_JNI_GENERATED_INCLUDE_DIR}"
)
Expand Down
413 changes: 413 additions & 0 deletions src/main/cpp/profiler/NvtxwEvents.cpp

Large diffs are not rendered by default.

188 changes: 188 additions & 0 deletions src/main/cpp/profiler/NvtxwEvents.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copyright year not updated from template, same license diff question here.

* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Licensed under the Apache License v2.0 with LLVM Exceptions.
* See LICENSE.txt for license information.
*/

#pragma once

#include <stdint.h>
#include <nvtx3/nvToolsExtPayload.h>

namespace NvidiaNvtxw
{

// Numeric schema identifiers. The values run consecutively from
// 0xffffff00 (nameId) through 0xffffff08 (nvtxRangeStartEndId).
namespace PayloadSchemaId {
    static constexpr uint64_t nameId              = 0xFFFFFF00;
    static constexpr uint64_t nvtxRangePushPopId  = 0xFFFFFF01;
    static constexpr uint64_t cuptiApiId          = 0xFFFFFF02;
    static constexpr uint64_t cuptiMemcpyId       = 0xFFFFFF03;
    static constexpr uint64_t cuptiMemsetId       = 0xFFFFFF04;
    static constexpr uint64_t cuptiDeviceId       = 0xFFFFFF05;
    static constexpr uint64_t cuptiKernelId       = 0xFFFFFF06;
    static constexpr uint64_t cuptiOverheadId     = 0xFFFFFF07;
    static constexpr uint64_t nvtxRangeStartEndId = 0xFFFFFF08;
}  // namespace PayloadSchemaId

// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema associated with PayloadSchemaId::nameId
// (TODO confirm against the implementation file).
const nvtxPayloadSchemaAttr_t* GetNameSchemaAttr();

// Plain aggregate carrying one range record: two 64-bit time values, a
// non-owning C-string pointer, and three 32-bit values.
// NOTE(review): member order and types fix the in-memory layout, which is
// presumably mirrored by the NVTX range schema definitions -- do not reorder
// without confirming against the implementation file.
struct nvtxRangeEvent {
    // 64-bit members
    uint64_t    time_start;
    uint64_t    time_stop;

    // pointer member (the struct does not manage the pointee)
    const char* name;

    // 32-bit members
    uint32_t    process_id;
    uint32_t    thread_id;
    uint32_t    color;
};
// Each function returns a pointer to a read-only nvtxPayloadSchemaAttr_t and
// is defined outside this header. Presumably they correspond to
// PayloadSchemaId::nvtxRangePushPopId and PayloadSchemaId::nvtxRangeStartEndId
// respectively and describe nvtxRangeEvent (TODO confirm).
const nvtxPayloadSchemaAttr_t* GetNvtxRangePushPopSchemaAttr();
const nvtxPayloadSchemaAttr_t* GetNvtxRangeStartEndSchemaAttr();
// Plain aggregate for one API record: two 64-bit time values followed by six
// 32-bit values (40 bytes with no internal padding).
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiApiSchemaAttr() -- confirm before
// reordering.
struct cuptiApiEvent {
    // 64-bit members
    uint64_t time_start;
    uint64_t time_stop;

    // 32-bit members
    uint32_t kind;
    uint32_t cbid;
    uint32_t process_id;
    uint32_t thread_id;
    uint32_t correlation_id;
    uint32_t return_value;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiApiEvent
// (PayloadSchemaId::cuptiApiId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiApiSchemaAttr();
// Plain aggregate of per-device properties: two 64-bit values, twenty-four
// 32-bit values, and a trailing non-owning C-string pointer.
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiDeviceSchemaAttr() -- confirm before
// reordering.
struct cuptiDevice {
    // 64-bit members
    uint64_t    global_memory_bandwidth;
    uint64_t    global_memory_size;

    // 32-bit members
    uint32_t    constant_memory_size;
    uint32_t    l2_cache_size;
    uint32_t    num_threads_per_warp;
    uint32_t    core_clock_rate;
    uint32_t    num_memcpy_engines;
    uint32_t    num_multiprocessors;
    uint32_t    max_ipc;
    uint32_t    max_warps_per_multiprocessor;
    uint32_t    max_blocks_per_multiprocessor;
    uint32_t    max_shared_memory_per_multiprocessor;
    uint32_t    max_registers_per_multiprocessor;
    uint32_t    max_registers_per_block;
    uint32_t    max_shared_memory_per_block;
    uint32_t    max_threads_per_block;
    uint32_t    max_block_dim_x;
    uint32_t    max_block_dim_y;
    uint32_t    max_block_dim_z;
    uint32_t    max_grid_dim_x;
    uint32_t    max_grid_dim_y;
    uint32_t    max_grid_dim_z;
    uint32_t    compute_capability_major;
    uint32_t    compute_capability_minor;
    uint32_t    id;
    uint32_t    ecc_enabled;

    // pointer member (the struct does not manage the pointee)
    const char* name;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiDevice
// (PayloadSchemaId::cuptiDeviceId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiDeviceSchemaAttr();
// Plain aggregate for one kernel record: eight 64-bit values, a non-owning
// C-string pointer, twenty-two 32-bit values, one 16-bit value and ten
// 8-bit values.
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiKernelSchemaAttr() -- confirm before
// reordering.
struct cuptiKernelEvent {
    // 64-bit members
    uint64_t    time_start;
    uint64_t    time_stop;
    uint64_t    completed;
    uint64_t    grid_id;
    uint64_t    queued;
    uint64_t    submitted;
    uint64_t    graph_node_id;
    uint64_t    local_memory_total_v2;

    // pointer member (the struct does not manage the pointee)
    const char* name;

    // 32-bit members
    uint32_t    device_id;
    uint32_t    context_id;
    uint32_t    stream_id;
    uint32_t    process_id;
    uint32_t    grid_x;
    uint32_t    grid_y;
    uint32_t    grid_z;
    uint32_t    block_x;
    uint32_t    block_y;
    uint32_t    block_z;
    uint32_t    static_shared_memory;
    uint32_t    dynamic_shared_memory;
    uint32_t    local_memory_per_thread;
    uint32_t    local_memory_total;
    uint32_t    correlation_id;
    uint32_t    shared_memory_executed;
    uint32_t    graph_id;
    uint32_t    channel_id;
    uint32_t    cluster_x;
    uint32_t    cluster_y;
    uint32_t    cluster_z;
    uint32_t    cluster_scheduling_policy;

    // 16-bit member
    uint16_t    registers_per_thread;

    // 8-bit members
    uint8_t     requested;
    uint8_t     executed;
    uint8_t     shared_memory_config;
    uint8_t     partitioned_global_cache_requested;
    uint8_t     partitioned_global_cache_executed;
    uint8_t     launch_type;
    uint8_t     is_shared_memory_carveout_requested;
    uint8_t     shared_memory_carveout_requested;
    uint8_t     shmem_limit_config;
    uint8_t     channel_type;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiKernelEvent
// (PayloadSchemaId::cuptiKernelId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiKernelSchemaAttr();

// Plain aggregate for one memcpy record: four 64-bit values, eight 32-bit
// values and four 8-bit values.
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiMemcpySchemaAttr() -- confirm before
// reordering. `channelType` is camelCase here while cuptiKernelEvent uses
// `channel_type`; the spelling is kept because it is part of the interface.
struct cuptiMemcpyEvent {
    // 64-bit members
    uint64_t time_start;
    uint64_t time_stop;
    uint64_t bytes;
    uint64_t graph_node_id;

    // 32-bit members
    uint32_t device_id;
    uint32_t context_id;
    uint32_t stream_id;
    uint32_t process_id;
    uint32_t correlation_id;
    uint32_t runtime_correlation_id;
    uint32_t graph_id;
    uint32_t channel_id;

    // 8-bit members
    uint8_t  channelType;
    uint8_t  copy_kind;
    uint8_t  src_kind;
    uint8_t  dst_kind;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiMemcpyEvent
// (PayloadSchemaId::cuptiMemcpyId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiMemcpySchemaAttr();

// Plain aggregate for one memset record: four 64-bit values, eight 32-bit
// values and three 8-bit values.
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiMemsetSchemaAttr() -- confirm before
// reordering.
struct cuptiMemsetEvent {
    // 64-bit members
    uint64_t time_start;
    uint64_t time_stop;
    uint64_t bytes;
    uint64_t graph_node_id;

    // 32-bit members
    uint32_t device_id;
    uint32_t context_id;
    uint32_t stream_id;
    uint32_t process_id;
    uint32_t correlation_id;
    uint32_t graph_id;
    uint32_t channel_id;
    uint32_t value;

    // 8-bit members
    uint8_t  channelType;
    uint8_t  mem_kind;
    uint8_t  flags;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiMemsetEvent
// (PayloadSchemaId::cuptiMemsetId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiMemsetSchemaAttr();
// Plain aggregate for one overhead record: two 64-bit time values, two
// 32-bit values and one 8-bit value.
// NOTE(review): member order and types fix the in-memory layout, presumably
// matched by the schema from GetCuptiOverheadSchemaAttr() -- confirm before
// reordering.
struct cuptiOverheadEvent {
    // 64-bit members
    uint64_t time_start;
    uint64_t time_stop;

    // 32-bit members
    uint32_t process_id;
    uint32_t thread_id;

    // 8-bit member
    uint8_t  overhead_kind;
};
// Returns a pointer to a read-only nvtxPayloadSchemaAttr_t. Defined outside
// this header; presumably the schema describing cuptiOverheadEvent
// (PayloadSchemaId::cuptiOverheadId) -- TODO confirm.
const nvtxPayloadSchemaAttr_t* GetCuptiOverheadSchemaAttr();

}
22 changes: 22 additions & 0 deletions src/main/cpp/profiler/README-nvtxw.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
1. NvtxwEvents.h and NvtxwEvents.cpp are copied from the Nsight Systems source code. They must be kept in sync between this project and Nsight Systems.

2. Set the NVTXW_BACKEND environment variable to the path of the libNvtxwBackend.so library, which is found in the host directory of a current build of Nsight Systems. For example:
> export NVTXW_BACKEND=/opt/nvidia/nsight-systems/2024.6.0/host-linux-x64/libNvtxwBackend.so
Comment on lines +3 to +4
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a command-line option of the converter? That would make it much more obvious to the user how to set it. Either way, we should update the error shown when the backend library is not found so that it explains how the user can override the path used to find it.


3. Run like this:
> ./target/jni/cmake-build/profiler/spark_rapids_profile_converter -w -o file3021460.nsys-rep rapids-profile-3021460@jlowe-lcedt-driver.bin
and get output similar to this:
Backend implementation loaded! Applying config string...
Loader config key/value pairs not provided
Creating report: "file3021460.nsys-rep"
- Created session: file3021460
Session config key/value pairs not provided
- Created stream: Stream1
Domain: SparkRAPIDS
Scope:
- Destroyed stream: Stream1
3946 events imported
- Destroyed session: file3021460
Backend implementation prepared for unload.

4. Load the report into the Nsight Systems UI: nsys-ui file3021460.nsys-rep
Loading
Loading