Skip to content

Commit

Permalink
Fix static quantization for QDQ and Percentile distribution (#17649)
Browse files Browse the repository at this point in the history
### Description
One quantization case was not covered by the current list of unit tests.
This PR adds a unit test to cover that case, along with the fix. It
resolves issue #17619.



### Motivation and Context
  • Loading branch information
xadupre authored Sep 25, 2023
1 parent df15a3a commit 905faea
Show file tree
Hide file tree
Showing 7 changed files with 13,909 additions and 7 deletions.
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ class QLinearConv : public OpKernel {
W_zero_point_value = W_zero_point_data[0];
for (int64_t i = 1; i < W_zero_point_size; i++) {
ORT_ENFORCE(W_zero_point_data[i] == W_zero_point_value,
"QLinearConv : zero point of per-channel filter must be same");
"QLinearConv : zero point of per-channel filter must be same. "
"This happens by design if the quantization is symmetric.");
}
}

Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/python/tools/quantization/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


class TensorData:
_allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges"])
_allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges", "bins"])

def __init__(self, **kwargs):
for k, v in kwargs.items():
Expand Down Expand Up @@ -55,7 +55,7 @@ def __init__(self, calibration_method, data: Dict[str, Union[TensorData, Tuple]]
self.data[k] = TensorData(lowest=v[0], highest=v[1])
continue
if len(v) == 4:
self.data[k] = TensorData(lowest=v[0], highest=v[1], histogram=v[2], bins=v[3])
self.data[k] = TensorData(lowest=v[0], highest=v[1], hist=v[2], bins=v[3])
continue
raise TypeError(f"Unexpected tuple for {k:r}, it has {len(v)} elements: {v}.")
if not isinstance(v, TensorData):
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/python/tools/quantization/operators/conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def quantize(self):
nodes,
) = self.quantizer.quantize_activation(node, [0])
quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quantized_input_names.append(quant_weight_tuple[0])
zero_point_names.append(quant_weight_tuple[1])
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/python/tools/quantization/operators/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ def quantize(self):
R.dims[0] = R_num_dir * R_4_hidden_size

quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[1], onnx_proto.TensorProto.INT8, 0
node.input[1], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)
quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
node.input[2], onnx_proto.TensorProto.INT8, 0
node.input[2], onnx_proto.TensorProto.INT8, 0 # self.quantizer.weight_qType?
)

W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806
Expand Down
8 changes: 7 additions & 1 deletion onnxruntime/python/tools/quantization/qdq_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,13 @@ def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None):
raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.")
q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel(
weight_name,
self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType,
# Quantization type is forced to be TensorProto.INT8,
# when the expected value would be (see below)
# self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType.
# QLinearConv expects to have a unique value for all channels.
# This code does not enforce that but it is necessarily the case when the
# quantization is symmetric (as for INT8).
onnx_proto.TensorProto.INT8,
axis,
keep_float_weight=self.add_qdq_pair_to_weight,
)
Expand Down
Loading

0 comments on commit 905faea

Please sign in to comment.