Skip to content

Commit

Permalink
[Misc] add gpu_memory_utilization arg (vllm-project#5079)
Browse files Browse the repository at this point in the history
Signed-off-by: pandyamarut <pandyamarut@gmail.com>
  • Loading branch information
pandyamarut authored and robertgshaw2-neuralmagic committed Jul 14, 2024
1 parent 4e66b09 commit 2ed01a6
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def main(args: argparse.Namespace):
use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size)
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization)

sampling_params = SamplingParams(
n=args.n,
Expand Down Expand Up @@ -214,5 +215,11 @@ def run_to_completion(profile_dir: Optional[str] = None):
type=str,
default=None,
help='Path to save the latency results in JSON format.')
# Expose the GPU memory fraction on the command line; the parsed value is
# read in main() as args.gpu_memory_utilization.
parser.add_argument(
    '--gpu-memory-utilization',
    type=float,
    default=0.9,
    # Adjacent string literals are concatenated verbatim, so the space
    # separating the two sentences must be written explicitly.
    help='the fraction of GPU memory to be used for '
    'the model executor, which can range from 0 to 1. '
    'If unspecified, will use the default value of 0.9.')
# Parse the command line and run the benchmark with the resulting namespace.
args = parser.parse_args()
main(args)

0 comments on commit 2ed01a6

Please sign in to comment.