[Misc] add gpu_memory_utilization arg (vllm-project#5079)

Signed-off-by: pandyamarut <pandyamarut@gmail.com>
neuralmagic · Jul 14, 2024 · 2ed01a6 · 2ed01a6
1 parent 4e66b09
commit 2ed01a6
Showing 1 changed file with 8 additions and 1 deletion.
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -35,7 +35,8 @@ def main(args: argparse.Namespace):
               use_v2_block_manager=args.use_v2_block_manager,
               enable_chunked_prefill=args.enable_chunked_prefill,
               download_dir=args.download_dir,
-              block_size=args.block_size)
+              block_size=args.block_size,
+              gpu_memory_utilization=args.gpu_memory_utilization)
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -214,5 +215,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
         type=str,
         default=None,
         help='Path to save the latency results in JSON format.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
     args = parser.parse_args()
     main(args)