Change target tritonfastertransformerbackend to tritonturbomindbackend (InternLM#36)

* change target tritonfastertransformerbackend to tritonturbomindbackend

* install targets to backends/turbomind

* change model_dir
lvhan028 authored Jul 1, 2023
1 parent 35d6446 commit 70e6ab2
Showing 11 changed files with 56 additions and 56 deletions.
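
To sanity-check the install-path change this commit makes, here is a minimal sketch, assuming the out-of-source `build/` directory and the `build/install` prefix that the README examples pass to `--lib-dir`; adjust the paths to your own setup.

```shell
# Rebuild and reinstall after this commit; the Triton backend libraries
# should now land under backends/turbomind instead of backends/fastertransformer.
cd build && make -j$(nproc) && make install

# This is the directory the README passes to service_docker_up.sh --lib-dir.
ls install/backends/turbomind
```

A stale `backends/fastertransformer` directory left over from an earlier build can simply be removed.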
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -374,8 +374,8 @@ install(
transformer-shared
EXPORT
transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
)

install(
8 changes: 4 additions & 4 deletions README.md
@@ -100,7 +100,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -111,7 +111,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -129,7 +129,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-7b-delta-v1.1

python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -145,7 +145,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-13b-delta-v1.1

python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
8 changes: 4 additions & 4 deletions README_zh-CN.md
@@ -98,7 +98,7 @@ make -j$(nproc) && make install
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -127,7 +127,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-7b-delta-v1.1

python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
@@ -143,7 +143,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-13b-delta-v1.1

python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```

</details>
File renamed without changes.
4 changes: 2 additions & 2 deletions examples/cpp/llama/llama_config.ini
@@ -2,8 +2,8 @@
data_type=fp16
enable_custom_all_reduce=0
pipeline_para_size=1
tensor_para_size=8
model_dir=/shared_data/chatpjlm-0/v0.2.3/fastertransformer/weights/
tensor_para_size=1
model_dir=/workspace/models/triton_models/weights/


[request]
2 changes: 1 addition & 1 deletion lmdeploy/serve/turbomind/deploy.py
@@ -227,7 +227,7 @@ def get_param(_name, _size):
del ckpt

for name, param in model_params.items():
# transpose all weights as FasterTransformer is expecting column-major
# transpose all weights as TurboMind is expecting column-major
# weights: (output_dims, input_dims) -> (input_dims, output_dims)
key = name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
4 changes: 2 additions & 2 deletions lmdeploy/serve/turbomind/service_docker_up.sh
@@ -5,7 +5,7 @@ show_help() {
echo
echo "Options:"
echo " -h, --help Show this help message and exit"
echo " --lib-dir Specify the directory of fastertransformer libraries"
echo " --lib-dir Specify the directory of turbomind libraries"
}

# check if '-h' or '--help' in the arguments
@@ -64,7 +64,7 @@ for ((i = 1; i <= $#; i++)); do
docker run \
--gpus $DEVICES \
--rm \
-v "${LIB_PATH}":/opt/tritonserver/backends/fastertransformer \
-v "${LIB_PATH}":/opt/tritonserver/backends/turbomind \
-v ""${SCRIPT_ABS_DIR}"":/workspace/models \
--shm-size 16g \
-p 33336:22 \
@@ -25,7 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "turbomind"
backend: "fastertransformer"
backend: "turbomind"
default_model_filename: "weights"
max_batch_size: 1

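For context on the `backend: "turbomind"` line above: Triton loads a backend from a directory whose name matches this field, which is why this commit moves the install destinations to `backends/turbomind` and renames the output library to `triton_turbomind` further down. A rough sketch of the layout the server would be expected to see inside the container started by `service_docker_up.sh` (paths are illustrative, not taken from this commit):

```shell
# The --lib-dir directory is mounted at the backend search path; the backend
# name in config.pbtxt must match this directory name.
ls /opt/tritonserver/backends/turbomind
# expected to list libtriton_turbomind.so
```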
2 changes: 1 addition & 1 deletion src/turbomind/models/llama/llama_utils.cu
@@ -148,7 +148,7 @@ size_t curandStateGetSize()
bool isDebug()
{
static const bool is_debug = [] {
const auto level = std::getenv("FT_DEBUG_LEVEL");
const auto level = std::getenv("TM_DEBUG_LEVEL");
if (level && level == std::string("DEBUG")) {
return true;
}
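Following the `getenv` check above, a minimal sketch of enabling the debug path; this assumes the variable is visible to the server process (for the Dockerized setup it would presumably have to be passed into the container, e.g. via `docker run -e`):

```shell
# isDebug() returns true only when the variable equals the exact string "DEBUG".
export TM_DEBUG_LEVEL=DEBUG
```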
66 changes: 33 additions & 33 deletions src/turbomind/triton_backend/CMakeLists.txt
@@ -26,7 +26,7 @@

cmake_minimum_required (VERSION 3.18)

project(tritonfastertransformerbackend LANGUAGES C CXX)
project(tritonturbomindbackend LANGUAGES C CXX)

#
# Options
@@ -89,12 +89,12 @@ endif() # TRITON_ENABLE_GPU
configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY)

add_library(
triton-fastertransformer-backend SHARED
triton-turbomind-backend SHARED
libfastertransformer.cc
)

add_library(
TritonFasterTransformerBackend::triton-fastertransformer-backend ALIAS triton-fastertransformer-backend
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
)

find_package(CUDAToolkit REQUIRED)
@@ -106,13 +106,13 @@ endif()

set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

target_compile_definitions(triton-fastertransformer-backend
target_compile_definitions(triton-turbomind-backend
PUBLIC
USE_TRITONSERVER_DATATYPE
BUILD_MULTI_GPU)

target_include_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${TRITON_PYTORCH_INCLUDE_PATHS}
@@ -123,31 +123,31 @@ target_include_directories(
)

target_link_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${CUDA_PATH}/lib64
)

target_compile_features(triton-fastertransformer-backend PRIVATE cxx_std_14)
target_compile_features(triton-turbomind-backend PRIVATE cxx_std_14)

target_compile_options(
triton-fastertransformer-backend PRIVATE
triton-turbomind-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror>
)

if(${TRITON_ENABLE_GPU})
target_compile_definitions(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE TRITON_ENABLE_GPU=1
)
endif() # TRITON_ENABLE_GPU

set_target_properties(
triton-fastertransformer-backend
triton-turbomind-backend
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_fastertransformer
OUTPUT_NAME triton_turbomind
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE
@@ -159,7 +159,7 @@ set_target_properties(
# Need to turn off unused-but-set-variable due to Torchvision
# Need to turn off unknown-pragmas due to ATen OpenMP
set_target_properties(
triton-fastertransformer-backend
triton-turbomind-backend
PROPERTIES COMPILE_FLAGS
"-Wno-unknown-pragmas -Wno-unused-but-set-variable"
)
@@ -170,7 +170,7 @@ FOREACH(p ${TRITON_PYTORCH_LIB_PATHS})
ENDFOREACH(p)

target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
@@ -186,23 +186,23 @@ target_link_libraries(

if (BUILD_MULTI_GPU)
target_compile_definitions(
triton-fastertransformer-backend
triton-turbomind-backend
PUBLIC
BUILD_MULTI_GPU
)
target_include_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${MPI_INCLUDE_PATH}
)
target_link_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${MPI_Libraries}
/usr/local/mpi/lib
)
target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${NCCL_LIBRARIES}
${MPI_LIBRARIES}
@@ -211,7 +211,7 @@ endif()

if(${TRITON_ENABLE_GPU})
target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
CUDA::cudart
)
@@ -221,51 +221,51 @@ endif() # TRITON_ENABLE_GPU
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonFasterTransformerBackend)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMindBackend)

install(
TARGETS
triton-fastertransformer-backend
triton-turbomind-backend
EXPORT
triton-fastertransformer-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
triton-turbomind-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
)

install(
EXPORT
triton-fastertransformer-backend-targets
triton-turbomind-backend-targets
FILE
TritonFasterTransformerBackendTargets.cmake
TritonTurboMindBackendTargets.cmake
NAMESPACE
TritonFasterTransformerBackend::
TritonTurboMindBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)

include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_SOURCE_DIR}/cmake/TritonFasterTransformerBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
${CMAKE_SOURCE_DIR}/cmake/TritonTurboMindBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)

install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)

#
# Export from build tree
#
export(
EXPORT triton-fastertransformer-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendTargets.cmake
NAMESPACE TritonFasterTransformerBackend::
EXPORT triton-turbomind-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendTargets.cmake
NAMESPACE TritonTurboMindBackend::
)

export(PACKAGE TritonFasterTransformerBackend)
export(PACKAGE TritonTurboMindBackend)


# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
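Worth noting: the rename stops at target, install, and export names; the `libtriton_fastertransformer.ldscript` referenced by `configure_file` above and the `libfastertransformer.cc` source below keep the old naming in this commit. A hedged way to list what still carries the old name in a checkout (the path is inferred from the hunks shown here):

```shell
# List remaining occurrences of the old FasterTransformer naming
# under the Triton backend sources.
grep -rn "fastertransformer" src/turbomind/triton_backend/
```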
12 changes: 6 additions & 6 deletions src/turbomind/triton_backend/libfastertransformer.cc
@@ -511,11 +511,11 @@ TRITONSERVER_Error* ModelState::AutoCompleteConfig()
}
}
else {
// Auto-complete configuration is not supported since fastertransformer does
// Auto-complete configuration is not supported since turbomind does
// not store/capture sufficient model metadata so just log error instead.
LOG_MESSAGE(TRITONSERVER_LOG_WARN,
(std::string("skipping model configuration auto-complete for '") + Name()
+ "': not supported for fastertransformer backend")
+ "': not supported for turbomind backend")
.c_str());
}

@@ -940,7 +940,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const
request_count,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string("null request given to FasterTransformer backend for '" + Name() + "'").c_str()));
std::string("null request given to TurboMind backend for '" + Name() + "'").c_str()));
return;
}

@@ -1115,7 +1115,7 @@ for (auto& response : responses) {
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send FasterTransformer backend response");
"failed to send TurboMind backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str());
}
else {
@@ -1160,7 +1160,7 @@ void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>>
if (response != nullptr) {
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
"failed to send FasterTransformer backend response");
"failed to send TurboMind backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
}
else {
@@ -1358,7 +1358,7 @@ ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>*
responses,
response_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
("FasterTransformer execute failure: " + std::string(ex.what())).c_str()));
("TurboMind execute failure: " + std::string(ex.what())).c_str()));
}
auto output_tensors = output_tensors_list[0];
return output_tensors;
