forked from NVIDIA/FasterTransformer
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
819 changed files
with
159,828 additions
and
47,203 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
Language: Cpp | ||
AccessModifierOffset: -4 | ||
AlignAfterOpenBracket: Align | ||
AllowShortEnumsOnASingleLine: false | ||
AlignConsecutiveAssignments: false | ||
AlignConsecutiveDeclarations: false | ||
AlignEscapedNewlines: Right | ||
AlignOperands: true | ||
AlignTrailingComments: true | ||
AllowAllParametersOfDeclarationOnNextLine: true | ||
AllowAllArgumentsOnNextLine: true | ||
AllowShortBlocksOnASingleLine: Empty | ||
AllowShortCaseLabelsOnASingleLine: false | ||
AllowShortFunctionsOnASingleLine: Empty | ||
AllowShortIfStatementsOnASingleLine: Never | ||
AllowShortLoopsOnASingleLine: false | ||
AlwaysBreakAfterReturnType: None | ||
AlwaysBreakBeforeMultilineStrings: false | ||
AlwaysBreakTemplateDeclarations: true | ||
BinPackArguments: false | ||
BinPackParameters: false | ||
BreakBeforeBinaryOperators: NonAssignment | ||
BreakBeforeBraces: Stroustrup | ||
BreakBeforeTernaryOperators: false | ||
BreakConstructorInitializers: AfterColon | ||
BreakInheritanceList: AfterColon | ||
BreakStringLiterals: false | ||
ColumnLimit: 120 | ||
CompactNamespaces: false | ||
ConstructorInitializerAllOnOneLineOrOnePerLine: true | ||
ConstructorInitializerIndentWidth: 4 | ||
ContinuationIndentWidth: 4 | ||
Cpp11BracedListStyle: true | ||
DerivePointerAlignment: false | ||
FixNamespaceComments: true | ||
IndentCaseLabels: true | ||
IndentPPDirectives: None | ||
IndentWidth: 4 | ||
IndentWrappedFunctionNames: false | ||
KeepEmptyLinesAtTheStartOfBlocks: true | ||
MaxEmptyLinesToKeep: 1 | ||
NamespaceIndentation: None | ||
PointerAlignment: Left | ||
ReflowComments: true | ||
SortIncludes: true | ||
SortUsingDeclarations: false | ||
SpaceAfterCStyleCast: false | ||
SpaceAfterTemplateKeyword: false | ||
SpaceBeforeAssignmentOperators: true | ||
SpaceBeforeCtorInitializerColon: false | ||
SpaceBeforeInheritanceColon: false | ||
SpaceBeforeParens: ControlStatements | ||
SpaceInEmptyParentheses: false | ||
SpacesBeforeTrailingComments: 2 | ||
SpacesInAngles: false | ||
SpacesInCStyleCastParentheses: false | ||
SpacesInContainerLiterals: false | ||
SpacesInParentheses: false | ||
SpacesInSquareBrackets: false | ||
Standard: Cpp11 | ||
TabWidth: 4 | ||
UseTab: Never |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
[flake8] | ||
ignore = W292 | ||
exclude = | ||
*migrations*, | ||
# python related | ||
*.pyc, | ||
.git, | ||
__pycache__, | ||
|
||
max-line-length=120 | ||
max-complexity=12 | ||
format=pylint | ||
show_source = True | ||
statistics = True | ||
count = True |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,8 @@ | ||
*~ | ||
*.o | ||
*build*/ | ||
*.pyc | ||
models/ | ||
__pycache__/ | ||
.vscode | ||
./translation | ||
.cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
stages: | ||
- build | ||
- test | ||
|
||
build_pyt_release: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: build | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
artifacts: | ||
paths: | ||
- ${CI_PROJECT_DIR}/build/ | ||
expire_in: 1 week | ||
script: | ||
- cd ${CI_PROJECT_DIR} && mkdir build && cd build | ||
- git submodule init && git submodule update | ||
- cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_GPT=ON .. | ||
- make -j12 | ||
|
||
build_pyt_release_sparse: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: build | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
artifacts: | ||
paths: | ||
- ${CI_PROJECT_DIR}/build/ | ||
expire_in: 1 week | ||
script: | ||
- cd ${CI_PROJECT_DIR} && mkdir build && cd build | ||
- git submodule init && git submodule update | ||
- wget https://developer.download.nvidia.com/compute/libcusparse-lt/0.1.0/local_installers/libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz | ||
- tar -xzvf libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz | ||
- cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DSPARSITY_SUPPORT=ON -DCUSPARSELT_PATH=${CI_PROJECT_DIR}/build/libcusparse_lt/ .. | ||
- make -j12 | ||
|
||
build_tf_release: | ||
image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3 | ||
tags: | ||
- fastertransformer | ||
stage: build | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
artifacts: | ||
paths: | ||
- ${CI_PROJECT_DIR}/build/ | ||
expire_in: 1 week | ||
script: | ||
- cd ${CI_PROJECT_DIR} && mkdir build && cd build | ||
- git submodule init && git submodule update | ||
- cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python3.8/dist-packages/tensorflow_core/ -DBUILD_GPT=ON .. | ||
- make -j12 | ||
- apt-get update && apt-get install bc | ||
|
||
# 1. Get accuracy on LAMBADA dataset | ||
# 2. Run pytorch gpt op as baseline | ||
# 3. Run pytorch pipeline parallel and compare difference with baseline | ||
# 4. Run pytorch tensor parallel and compare difference with baseline | ||
pyt_gpt_test: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_pyt_release | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/build/ | ||
- git submodule init && git submodule update | ||
- export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH" | ||
- export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32 | ||
- export CUDA_VISIBLE_DEVICES=0 | ||
- wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ../models | ||
- wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ../models | ||
- wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip | ||
- wget https://github.com/cybertronai/bflm/raw/master/lambada_test.jsonl -P ../models/megatron-models | ||
- unzip megatron_lm_345m_v0.0.zip -d ../models/megatron-models/345m | ||
- python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 1 | ||
- bash ../examples/pytorch/gpt/scripts/evaluate_zeroshot_gpt.sh | ||
- python ../examples/pytorch/gpt/gpt_example.py --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file single-gpu-out.txt | ||
- export CUDA_VISIBLE_DEVICES=0,1 | ||
- mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=1 --pipeline_para_size=2 --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file pipeline-parallel-2-gpu-out.txt | ||
- diff single-gpu-out.txt pipeline-parallel-2-gpu-out.txt | ||
- python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 2 | ||
- mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=2 --pipeline_para_size=1 --ckpt_path=../models/megatron-models/c-model/345m/2-gpu/ --top_p 0.5 --sample_output_file tensor-parallel-2-gpu-out.txt | ||
- diff single-gpu-out.txt tensor-parallel-2-gpu-out.txt | ||
timeout: 4h 30m | ||
|
||
tf_test: | ||
image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_tf_release | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/build/ | ||
- apt-get update && apt-get install bc | ||
- export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH" | ||
- export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32 | ||
- export CUDA_VISIBLE_DEVICES=0 | ||
- bash ${CI_PROJECT_DIR}/examples/tensorflow/decoding/utils/translation/download_model_data.sh | ||
- mkdir -p ${CI_PROJECT_DIR}/translation/ckpt_fp16 | ||
- python ${CI_PROJECT_DIR}/examples/tensorflow/ckpt_type_convert.py --init_checkpoint=${CI_PROJECT_DIR}/translation/ckpt/model.ckpt-500000 --fp16_checkpoint=${CI_PROJECT_DIR}/translation/ckpt_fp16/model.ckpt-500000 | ||
- python ${CI_PROJECT_DIR}/tests/decoding/tf_decoding_unit_test.py | ||
timeout: 4h 30m | ||
|
||
tf_xlnet_test: | ||
image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_tf_release | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/examples/tensorflow/xlnet | ||
- bash downloadModel.sh | ||
- bash verifyCorrectness.sh # For FP32 model | ||
|
||
pyt_sp_test: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_pyt_release_sparse | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/build/ | ||
- export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH" | ||
- export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32 | ||
- export CUDA_VISIBLE_DEVICES=0 | ||
- pip install transformers==2.5.1 | ||
# GOS has no Ampere GPU, so no sparse tests can be done. Only test some dense cases. | ||
- ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 0 | ||
- python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 | ||
- ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 1 | ||
- python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 1 | ||
- python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 2 | ||
- python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 3 | ||
|
||
pyt_longformer_test: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_pyt_release | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/examples/pytorch/longformer | ||
- apt-get update && apt-get install git-lfs | ||
- git lfs install | ||
- git config lfs.fetchinclude "pytorch_model.bin,config.json" | ||
- git clone https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa | ||
- cd ${CI_PROJECT_DIR} | ||
- export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH" | ||
- export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32 | ||
- export CUDA_VISIBLE_DEVICES=0 | ||
- pip install transformers==4.8.2 | ||
- python3 tests/longformer/py_longformer_unit_test.py | ||
|
||
pyt_decoding_test: | ||
image: nvcr.io/nvidia/pytorch:21.02-py3 | ||
tags: | ||
- fastertransformer | ||
stage: test | ||
only: | ||
- master | ||
- v4.1 | ||
- main | ||
needs: | ||
- job: build_pyt_release | ||
artifacts: true | ||
script: | ||
- cd ${CI_PROJECT_DIR}/build/ | ||
- export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH" | ||
- export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32 | ||
- export CUDA_VISIBLE_DEVICES=0 | ||
- apt-get update && apt-get install bc | ||
- pip install sacrebleu | ||
- pip install opennmt-py==1.1.1 | ||
- bash ../examples/pytorch/decoding/utils/download_model.sh | ||
- mkdir pytorch/translation/data -p | ||
- cp ../examples/tensorflow/decoding/utils/translation/test* pytorch/translation/data | ||
- python ../examples/pytorch/decoding/utils/recover_bpe.py pytorch/translation/data/test.de debpe_ref.txt | ||
- echo "Run decoding fp32" # decoding fp32 testing | ||
- python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp32 --output_file output.txt | ||
- python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt | ||
- cat debpe_output.txt | sacrebleu debpe_ref.txt | ||
- echo "Run decoder fp32" # decoder fp32 testing | ||
- python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp32 --output_file output.txt | ||
- python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt | ||
- cat debpe_output.txt | sacrebleu debpe_ref.txt | ||
- echo "Run decoding fp16" # decoding fp16 testing | ||
- python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp16 --output_file output.txt | ||
- python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt | ||
- cat debpe_output.txt | sacrebleu debpe_ref.txt | ||
- echo "Run decoder fp16" # decoder fp16 testing | ||
- python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp16 --output_file output.txt | ||
- python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt | ||
- cat debpe_output.txt | sacrebleu debpe_ref.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[submodule "3rdparty/Megatron-LM"] | ||
path = 3rdparty/Megatron-LM | ||
url = https://github.com/NVIDIA/Megatron-LM.git | ||
branch = v2.4 | ||
[submodule "examples/tensorflow/bert/tensorflow_bert/bert"] | ||
path = examples/tensorflow/bert/tensorflow_bert/bert | ||
url = https://github.com/google-research/bert.git |
Oops, something went wrong.