Add FP32 and INT4 test in Llama2 #21187

Merged
merged 12 commits on Jun 27, 2024
FP32 and INT4 test
Yi Zhang committed Jun 27, 2024
commit c72fc2e85e30f6899a6ff9fdfec61ed316634f65
27 changes: 24 additions & 3 deletions tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -279,7 +279,7 @@ stages:
workingDirectory: $(Build.SourcesDirectory)
condition: ne(variables.hitAnother, 'True')

- stage: Llama2_ONNX_FP16
- stage: Llama2_7B_ONNX
dependsOn:
- Build_Onnxruntime_Cuda
jobs:
@@ -346,7 +346,7 @@ stages:
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m pip uninstall -y torch ; \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gpu;\
popd ; \
"
displayName: 'Run Llama2 to Onnx F16 and parity Test'
@@ -367,12 +367,33 @@ stages:
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m pip uninstall -y torch ; \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda --input /meta-llama2 ;\
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda;\
popd ; \
"
displayName: 'Run Llama2 to Onnx fp32 and parity Test'
workingDirectory: $(Build.SourcesDirectory)
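For reference, a minimal sketch of how the FP32 export could be sanity-checked outside the pipeline with the ONNX Runtime Python API; the model filename under llama2-7b-fp32-gpu is a placeholder, and the parity comparison itself is handled by convert_to_onnx as the step name suggests.

    # Sketch (not part of the pipeline): load the exported FP32 model on the CUDA EP.
    # The filename below is a placeholder; convert_to_onnx decides the actual name.
    import onnxruntime as ort

    model_path = "llama2-7b-fp32-gpu/model.onnx"  # placeholder path
    session = ort.InferenceSession(
        model_path,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    print([i.name for i in session.get_inputs()])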

- script: |
docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
-v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
-v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
onnxruntimeubi8packagestest \
bash -c "
set -ex; \
pushd /workspace/onnxruntime/python/tools/transformers/ ; \
python3 -m pip install --upgrade pip ; \
pushd models/llama ; \
python3 -m pip install -r requirements.txt ; \
popd ; \
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m pip uninstall -y torch ; \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa;\
popd ; \
"
displayName: 'Run Llama2 to Onnx INT4 and parity Test'
workingDirectory: $(Build.SourcesDirectory)
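As a rough illustration of what the INT4 export contains, a sketch that counts the 4-bit weight-quantized MatMul nodes in the resulting graph; the filename is a placeholder, and MatMulNBits is assumed to be the contrib op ONNX Runtime emits for INT4 weight-only quantization.

    # Sketch (not part of the pipeline): inspect the INT4 export.
    # Placeholder filename; the op type is assumed to be the MatMulNBits
    # contrib op used by ONNX Runtime for INT4 weight-only quantization.
    import onnx

    model = onnx.load("llama2-7b-int4-gpu/model.onnx", load_external_data=False)
    int4_nodes = [n for n in model.graph.node if n.op_type == "MatMulNBits"]
    print(f"{len(int4_nodes)} MatMulNBits nodes out of {len(model.graph.node)} nodes")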

- stage: Whisper_ONNX
dependsOn:
- Build_Onnxruntime_Cuda