casper-hansen · Apr 6, 2024
diff --git a/awq/models/base.py b/awq/models/base.py
@@ -136,6 +136,12 @@ def quantize(
                 "This argument avoids real quantization by only applying the scales without quantizing down to FP16."
             ),
         ] = False,
+        apply_clip: Annotated[
+            bool,
+            Doc(
+                "Whether to apply clipping to the model during quantization. Some models may perform better with this set to False."
+            ),
+        ] = True,
     ):
         """
         The main quantization function that you can use to quantize your model.
@@ -173,6 +179,7 @@ def quantize(
             duo_scaling,
             modules_to_not_convert=self.quant_config.modules_to_not_convert,
             export_compatible=export_compatible,
+            apply_clip=apply_clip,
         )
         self.quantizer.quantize()
 

diff --git a/awq/quantize/quantizer.py b/awq/quantize/quantizer.py
@@ -40,6 +40,7 @@ def __init__(
         duo_scaling,
         modules_to_not_convert=None,
         export_compatible=False,
+        apply_clip=True,
     ) -> None:
         self.awq_model = awq_model
         self.model = model
@@ -53,6 +54,7 @@ def __init__(
         self.text_column = text_column
         self.duo_scaling = duo_scaling
         self.export_compatible = export_compatible
+        self.apply_clip = apply_clip
         self.modules_to_not_convert = (
             modules_to_not_convert if modules_to_not_convert is not None else []
         )
@@ -161,13 +163,14 @@ def quantize(self):
             )
 
             # [STEP 3]: Compute and apply clipping list
-            clip_list = self._search_best_clip(
-                self.modules[i], named_linears, input_feat
-            )
-            apply_clip(self.modules[i], clip_list)
-            clip_list = append_str_prefix(
-                clip_list, get_op_name(self.model, self.modules[i]) + "."
-            )
+            if self.apply_clip:
+                clip_list = self._search_best_clip(
+                    self.modules[i], named_linears, input_feat
+                )
+                apply_clip(self.modules[i], clip_list)
+                clip_list = append_str_prefix(
+                    clip_list, get_op_name(self.model, self.modules[i]) + "."
+                )
 
             # [STEP 4]: Quantize weights
             if not self.export_compatible: