
[Misc] Load FP8 kv-cache scaling factors from checkpoints #4893

Merged
merged 6 commits on May 22, 2024
Changes from 1 commit
Commit message: string
comaniac committed May 20, 2024
commit e909f289c1c77857c84b54d7edad32d9f06c28f5
10 changes: 5 additions & 5 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -181,13 +181,13 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--profile", action="store_true")
parser.add_argument(
'--kv-cache-dtype',
"--kv-cache-dtype",
type=str,
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
help='Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
help="Data type for kv cache storage. If 'auto', will use model "
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
args = parser.parse_args()
print(args)

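The change above only switches the option's quoting style; the flag name, choices, and default are unchanged. For reference, a hypothetical invocation exercising the fp8 path (assuming the script's remaining options are left at their defaults) would be:

    python benchmarks/kernels/benchmark_paged_attention.py --kv-cache-dtype fp8_e4m3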
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/quantization/fp8.py
@@ -10,6 +10,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils import print_warning_once

ACTIVATION_SCHEMES = ["static", "dynamic"]

@@ -287,7 +288,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
"for fp8 KV cache")
layer._kv_scale = kv_scales[0]
if layer._kv_scale == 1.0:
logger.warning(
print_warning_once(
"Using KV cache scaling factor 1.0 for fp8_e4m3. This may "
"cause accuracy issues. Please make sure kv-cache scaling "
"factor is available in the fp8 checkpoint.")
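The scaling factor assigned to layer._kv_scale above is what makes fp8_e4m3 KV caching usable: values are divided by the per-layer scale before being narrowed to 8 bits and multiplied back when read. A minimal sketch of that arithmetic, assuming torch.float8_e4m3fn storage (illustrative only; the real cache-write path is a CUDA/HIP kernel):

    import torch

    FP8_E4M3_MAX = 448.0  # largest finite value representable in float8_e4m3fn


    def quantize_kv(x: torch.Tensor, kv_scale: float) -> torch.Tensor:
        # Scale down, clamp into the representable range, then narrow to 8 bits.
        return (x / kv_scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)


    def dequantize_kv(x_fp8: torch.Tensor, kv_scale: float,
                      dtype: torch.dtype = torch.float16) -> torch.Tensor:
        # Widen back and undo the scaling when a cache entry is read.
        return x_fp8.to(dtype) * kv_scale

With kv_scale left at the default of 1.0 (the case the warning above flags), anything outside roughly [-448, 448] saturates on the cast, which is where the accuracy loss comes from.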