[Misc] Load FP8 kv-cache scaling factors from checkpoints #4893

Merged (6 commits) on May 22, 2024
Changes from 1 commit
done
comaniac committed May 20, 2024
commit 61ddb2290aec64b2ecaa275b76c66f4639ba0202
12 changes: 6 additions & 6 deletions tests/models/test_fp8.py
@@ -25,12 +25,12 @@
         "auto": [
             'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
             'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-            'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
+            'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
             'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
             'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
+            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
         ],
         "fp8": [
             'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
@@ -75,7 +75,7 @@
 
 @pytest.mark.skipif(fp8_not_supported,
                     reason="fp8 is not supported on this GPU type.")
-@pytest.mark.parametrize("model_name", ["meta-llama/Meta-Llama-3-8B-Instruct"])
+@pytest.mark.parametrize("model_name", MODELS)
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
     model = LLM(model=model_name,
@@ -105,7 +105,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
         generations.append(outputs[0].outputs[0].text)
     del model
 
-    print(generations)
+    print(model_name, kv_cache_dtype, generations)
     expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
     for i in range(len(example_prompts)):
         generated_str = generations[i]
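For readers skimming the test changes: the hunks above imply that the expected generations are keyed first by model name and then by kv-cache dtype, and are compared against fresh generations one prompt at a time. A minimal sketch of that fixture shape and comparison, with placeholder strings and a hypothetical helper name (the real values and test body live in tests/models/test_fp8.py):

```python
# Hypothetical sketch of the fixture layout implied by the diff above.
MODELS = ["meta-llama/Meta-Llama-3-8B-Instruct"]

EXPECTED_STRS_MAP = {
    "meta-llama/Meta-Llama-3-8B-Instruct": {
        "auto": ["LLaMA is a high-throughput and memory-efficient ..."],
        "fp8": ["LLM (Large Language Model) is a type of ..."],
    },
}


def check_generations(generations, model_name, kv_cache_dtype):
    # Look up the reference outputs for this (model, kv-cache dtype) pair and
    # require an exact match for every prompt.
    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
    for generated_str, expected_str in zip(generations, expected_strs):
        assert generated_str == expected_str, (
            f"{model_name}/{kv_cache_dtype}: {generated_str!r} != {expected_str!r}")
```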
23 changes: 13 additions & 10 deletions vllm/model_executor/layers/quantization/fp8.py
@@ -278,17 +278,20 @@ def apply(self, layer: torch.nn.Module) -> torch.Tensor:
         raise RuntimeError("Fp8KVCacheMethod.apply should not be called.")
 
     def process_weights_after_loading(self, layer: Module) -> None:
-        kv_scale = layer.kv_scale.to("cpu").tolist()
+        # If the kv-cache dtype is auto, we enforce the kv-scale to be 1.0
+        # regardless whether the kv-scale is available in the checkpoint.
+        if layer.kv_cache_dtype != "auto":
+            kv_scale = layer.kv_scale.to("cpu").tolist()
+            if not isinstance(kv_scale, float):
+                raise ValueError("Only support per-tensor scaling factor "
+                                 "for fp8 KV cache")
+            layer._kv_scale = kv_scale
+            if layer._kv_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype:
+                print_warning_once(
+                    "Using KV cache scaling factor 1.0 for fp8_e4m3. This may "
+                    "cause accuracy issues. Please make sure kv-cache scaling "
+                    "factor is available in the fp8 checkpoint.")
         del layer.kv_scale
-        if not isinstance(kv_scale, float):
-            raise ValueError("Only support per-tensor scaling factor "
-                             "for fp8 KV cache")
-        layer._kv_scale = 1.0 # kv_scale
-        if layer._kv_scale == 1.0:
-            print_warning_once(
-                "Using KV cache scaling factor 1.0 for fp8_e4m3. This may "
-                "cause accuracy issues. Please make sure kv-cache scaling "
-                "factor is available in the fp8 checkpoint.")
 
 
 def all_close_1d(x: torch.Tensor) -> bool:
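The per-tensor check in the new process_weights_after_loading relies on how Tensor.tolist() behaves: a 0-dim scale round-trips to a plain Python float, while a per-channel scale comes back as a list and is rejected. A small illustration of that behaviour (not part of the PR):

```python
import torch

# A 0-dim (per-tensor) scale round-trips to a plain Python float...
per_tensor_scale = torch.tensor(1.0).to("cpu").tolist()
print(type(per_tensor_scale))   # <class 'float'>

# ...while a per-channel scale becomes a list, which the check above rejects.
per_channel_scale = torch.tensor([1.0, 0.5]).to("cpu").tolist()
print(type(per_channel_scale))  # <class 'list'>
```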
15 changes: 14 additions & 1 deletion vllm/model_executor/models/llama.py
@@ -47,7 +47,7 @@
     default_weight_loader, kv_cache_scales_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
-from vllm.utils import is_hip
+from vllm.utils import is_hip, print_warning_once
 
 
 class LlamaMLP(nn.Module):
@@ -412,6 +412,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                # Remapping the name of FP8 kv-scale.
+                if name.endswith("kv_scale"):
+                    remapped_kv_scale_name = name.replace(
+                        ".kv_scale", ".attn.kv_scale")
+                    if remapped_kv_scale_name not in params_dict:
+                        print_warning_once(
+                            f"Found kv scale in the checkpoint (e.g. {name}), "
+                            "but not found the expected name in the model "
+                            f"(e.g. {remapped_kv_scale_name}). kv-scale is "
+                            "not loaded.")
+                        continue
+                    else:
+                        name = remapped_kv_scale_name
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
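The remapping exists because checkpoints store the scale under the attention module's own prefix (for example ...self_attn.kv_scale), while the runtime parameter is registered one level deeper on the attention backend (...self_attn.attn.kv_scale). A standalone sketch of that rename; the helper and the example key are hypothetical, for illustration only:

```python
from typing import Dict, Optional


def remap_kv_scale_name(name: str, params_dict: Dict[str, object]) -> Optional[str]:
    """Map a checkpoint kv_scale key onto the model's parameter name.

    Returns None when the model has no matching parameter, in which case the
    scale is skipped (mirroring the warning path in the diff above).
    """
    if not name.endswith("kv_scale"):
        return name
    remapped = name.replace(".kv_scale", ".attn.kv_scale")
    return remapped if remapped in params_dict else None


# Hypothetical checkpoint key and parameter dict.
params = {"model.layers.0.self_attn.attn.kv_scale": object()}
print(remap_kv_scale_name("model.layers.0.self_attn.kv_scale", params))
# -> model.layers.0.self_attn.attn.kv_scale
```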
14 changes: 14 additions & 0 deletions vllm/model_executor/models/mixtral.py
@@ -580,6 +580,20 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                     # Skip loading extra bias for GPTQ models.
                     if name.endswith(".bias") and name not in params_dict:
                         continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale")
+                        if remapped_kv_scale_name not in params_dict:
+                            print_warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded.")
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
                     param = params_dict[name]
                     weight_loader = getattr(param, "weight_loader",
                                             default_weight_loader)
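Putting the pieces together, a user-level sketch of how a checkpoint carrying kv-cache scaling factors would be consumed. The model path is a placeholder, and the keyword arguments are assumed from the test earlier in this diff rather than taken from the PR itself:

```python
from vllm import LLM, SamplingParams

# Placeholder path: an FP8 checkpoint that ships per-tensor kv_scale tensors.
llm = LLM(model="path/to/fp8-checkpoint-with-kv-scales",
          quantization="fp8",
          kv_cache_dtype="fp8")

outputs = llm.generate(["A neural network is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```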