Update the dev/v5.0_beta

hurun · Sep 30, 2021 · 952a1f2 · 952a1f2
1 parent dd4c071
commit 952a1f2
Show file tree

Hide file tree

Showing 819 changed files with 159,828 additions and 47,203 deletions.
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,62 @@
+Language: Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AllowShortEnumsOnASingleLine: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowAllArgumentsOnNextLine: true
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Stroustrup
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializers: AfterColon
+BreakInheritanceList: AfterColon
+BreakStringLiterals: false
+ColumnLimit: 120
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+FixNamespaceComments: true
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCtorInitializerColon: false
+SpaceBeforeInheritanceColon: false
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 4
+UseTab: Never
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,15 @@
+[flake8]
+ignore = W292
+exclude =
+    *migrations*,
+    # python related
+    *.pyc,
+    .git,
+    __pycache__,
+
+max-line-length=120
+max-complexity=12
+format=pylint
+show_source = True
+statistics = True
+count = True
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,8 @@
 *~
 *.o
 *build*/
-*.pyc
+models/
+__pycache__/
+.vscode
+/translation
+.cache
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -0,0 +1,234 @@
+stages:  # stage names referenced by the `stage:` key of the jobs below
+  - build
+  - test
+
+build_pyt_release:  # Release build with the PyTorch (BUILD_PYT) and GPT (BUILD_GPT) options enabled
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: build
+  only:
+    - master
+    - v4.1
+    - main
+  artifacts:  # the build directory is kept for one week for jobs that list this job under `needs:`
+    paths:
+      - ${CI_PROJECT_DIR}/build/
+    expire_in: 1 week
+  script:
+    - cd ${CI_PROJECT_DIR} && mkdir build && cd build  # mkdir has no -p, so this step fails if build/ already exists
+    - git submodule init && git submodule update
+    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_GPT=ON ..  # NOTE(review): `xx` looks like a placeholder SM value - confirm it is intended
+    - make -j12
+
+build_pyt_release_sparse:  # Release PyTorch build with SPARSITY_SUPPORT=ON against a downloaded cuSPARSELt package
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: build
+  only:
+    - master
+    - v4.1
+    - main
+  artifacts:  # the build directory is kept for one week for jobs that list this job under `needs:`
+    paths:
+      - ${CI_PROJECT_DIR}/build/
+    expire_in: 1 week
+  script:
+    - cd ${CI_PROJECT_DIR} && mkdir build && cd build
+    - git submodule init && git submodule update
+    - wget https://developer.download.nvidia.com/compute/libcusparse-lt/0.1.0/local_installers/libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz
+    - tar -xzvf libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz  # presumably unpacks to build/libcusparse_lt/, the path given to CUSPARSELT_PATH below - verify
+    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DSPARSITY_SUPPORT=ON -DCUSPARSELT_PATH=${CI_PROJECT_DIR}/build/libcusparse_lt/ ..  # NOTE(review): `xx` looks like a placeholder SM value - confirm it is intended
+    - make -j12
+
+build_tf_release:  # Release build with the TensorFlow (BUILD_TF) and GPT (BUILD_GPT) options enabled
+  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
+  tags:
+    - fastertransformer
+  stage: build
+  only:
+    - master
+    - v4.1
+    - main
+  artifacts:  # the build directory is kept for one week for jobs that list this job under `needs:`
+    paths:
+      - ${CI_PROJECT_DIR}/build/
+    expire_in: 1 week
+  script:
+    - cd ${CI_PROJECT_DIR} && mkdir build && cd build
+    - git submodule init && git submodule update
+    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python3.8/dist-packages/tensorflow_core/ -DBUILD_GPT=ON ..
+    - make -j12
+    - apt-get update && apt-get install -y bc  # -y: do not stop at apt's confirmation prompt (CI is non-interactive). NOTE(review): this is the job's last step, so bc is not used here - confirm it is needed
+
+# 1. Get accuracy on LAMBADA dataset
+# 2. Run pytorch gpt op as baseline
+# 3. Run pytorch pipeline parallel and compare difference with baseline
+# 4. Run pytorch tensor parallel and compare difference with baseline
+pyt_gpt_test:  # GPT checks run from build/ using the artifacts of build_pyt_release
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_pyt_release
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/build/
+    - git submodule init && git submodule update
+    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
+    - export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32
+    - export CUDA_VISIBLE_DEVICES=0
+    - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ../models
+    - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ../models
+    - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
+    - wget https://github.com/cybertronai/bflm/raw/master/lambada_test.jsonl -P ../models/megatron-models
+    - unzip megatron_lm_345m_v0.0.zip -d ../models/megatron-models/345m
+    - python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 1
+    - bash ../examples/pytorch/gpt/scripts/evaluate_zeroshot_gpt.sh
+    - python ../examples/pytorch/gpt/gpt_example.py --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file single-gpu-out.txt
+    - export CUDA_VISIBLE_DEVICES=0,1  # the two mpirun steps below each start 2 processes
+    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=1 --pipeline_para_size=2 --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file pipeline-parallel-2-gpu-out.txt
+    - diff single-gpu-out.txt pipeline-parallel-2-gpu-out.txt  # diff exits non-zero if the pipeline-parallel output differs from the single-GPU baseline
+    - python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 2
+    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=2 --pipeline_para_size=1 --ckpt_path=../models/megatron-models/c-model/345m/2-gpu/ --top_p 0.5 --sample_output_file tensor-parallel-2-gpu-out.txt
+    - diff single-gpu-out.txt tensor-parallel-2-gpu-out.txt  # diff exits non-zero if the tensor-parallel output differs from the single-GPU baseline
+  timeout: 4h 30m
+
+tf_test:  # TensorFlow decoding unit test run from build/ using the artifacts of build_tf_release
+  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_tf_release
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/build/
+    - apt-get update && apt-get install -y bc  # -y: do not stop at apt's confirmation prompt (CI is non-interactive)
+    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
+    - export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32
+    - export CUDA_VISIBLE_DEVICES=0
+    - bash ${CI_PROJECT_DIR}/examples/tensorflow/decoding/utils/translation/download_model_data.sh
+    - mkdir -p ${CI_PROJECT_DIR}/translation/ckpt_fp16
+    - python ${CI_PROJECT_DIR}/examples/tensorflow/ckpt_type_convert.py --init_checkpoint=${CI_PROJECT_DIR}/translation/ckpt/model.ckpt-500000 --fp16_checkpoint=${CI_PROJECT_DIR}/translation/ckpt_fp16/model.ckpt-500000
+    - python ${CI_PROJECT_DIR}/tests/decoding/tf_decoding_unit_test.py
+  timeout: 4h 30m
+
+tf_xlnet_test:  # runs the XLNet example's download and verification scripts; needs the artifacts of build_tf_release
+  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_tf_release
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/examples/tensorflow/xlnet  # both scripts below are invoked from this directory
+    - bash downloadModel.sh
+    - bash verifyCorrectness.sh # For FP32 model
+
+pyt_sp_test:  # runs bert_gemm and bert_example.py using the artifacts of build_pyt_release_sparse
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_pyt_release_sparse
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/build/
+    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
+    - export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32
+    - export CUDA_VISIBLE_DEVICES=0
+    - pip install transformers==2.5.1
+    # GOS has no Ampere GPU, so no sparse tests can be done; only some dense cases are tested.
+    - ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 0
+    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16
+    - ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 1  # bert_gemm is re-run with a different last argument before the --int8_mode runs
+    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 1
+    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 2
+    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 3
+
+pyt_longformer_test:  # clones the Longformer TriviaQA model and runs the longformer unit test; needs build_pyt_release artifacts
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_pyt_release
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/examples/pytorch/longformer
+    - apt-get update && apt-get install -y git-lfs  # -y: do not stop at apt's confirmation prompt (CI is non-interactive)
+    - git lfs install
+    - git config lfs.fetchinclude "pytorch_model.bin,config.json"  # NOTE(review): this sets the option in the enclosing repository's local config - confirm it applies to the clone below
+    - git clone https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa
+    - cd ${CI_PROJECT_DIR}
+    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
+    - export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32
+    - export CUDA_VISIBLE_DEVICES=0
+    - pip install transformers==4.8.2
+    - python3 tests/longformer/py_longformer_unit_test.py
+
+pyt_decoding_test:  # translates the test set with the decoding/decoder extensions in fp32 and fp16 and scores each run with sacrebleu
+  image: nvcr.io/nvidia/pytorch:21.02-py3
+  tags:
+    - fastertransformer
+  stage: test
+  only:
+    - master
+    - v4.1
+    - main
+  needs:
+    - job: build_pyt_release
+      artifacts: true
+  script:
+    - cd ${CI_PROJECT_DIR}/build/
+    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
+    - export NVIDIA_TF32_OVERRIDE=0 # Disable the TF32
+    - export CUDA_VISIBLE_DEVICES=0
+    - apt-get update && apt-get install -y bc  # -y: do not stop at apt's confirmation prompt (CI is non-interactive)
+    - pip install sacrebleu
+    - pip install opennmt-py==1.1.1
+    - bash ../examples/pytorch/decoding/utils/download_model.sh
+    - mkdir pytorch/translation/data -p
+    - cp ../examples/tensorflow/decoding/utils/translation/test* pytorch/translation/data
+    - python ../examples/pytorch/decoding/utils/recover_bpe.py pytorch/translation/data/test.de debpe_ref.txt  # reference text reused by every sacrebleu call below
+    - echo "Run decoding fp32" # decoding fp32 testing
+    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp32 --output_file output.txt
+    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
+    - cat debpe_output.txt | sacrebleu debpe_ref.txt  # scores the output against the reference; no pass/fail threshold is applied in this command
+    - echo "Run decoder fp32" # decoder fp32 testing
+    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp32 --output_file output.txt
+    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
+    - cat debpe_output.txt | sacrebleu debpe_ref.txt
+    - echo "Run decoding fp16" # decoding fp16 testing
+    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp16 --output_file output.txt
+    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
+    - cat debpe_output.txt | sacrebleu debpe_ref.txt
+    - echo "Run decoder fp16" # decoder fp16 testing
+    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp16 --output_file output.txt
+    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
+    - cat debpe_output.txt | sacrebleu debpe_ref.txt
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "3rdparty/Megatron-LM"]
+	path = 3rdparty/Megatron-LM
+	url = https://github.com/NVIDIA/Megatron-LM.git
+	branch = v2.4
+[submodule "examples/tensorflow/bert/tensorflow_bert/bert"]
+	path = examples/tensorflow/bert/tensorflow_bert/bert
+	url = https://github.com/google-research/bert.git