Column equality testing fixes (#10011)

Fixes a bug where empty columns were not comparing correctly, as well as a few edge cases with strings.

Partially addresses #8513

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Michael Wang (https://github.com/isVoid)

URL: #10011
rapidsai · Feb 8, 2022 · acb6aed · acb6aed
1 parent 6502cea
commit acb6aed
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 22 deletions.
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
@@ -9,10 +9,31 @@
 import pandas as pd
 
 import cudf
-from cudf.api.types import is_categorical_dtype, is_numeric_dtype
+from cudf._lib.unary import is_nan
+from cudf.api.types import (
+    is_categorical_dtype,
+    is_decimal_dtype,
+    is_interval_dtype,
+    is_list_dtype,
+    is_numeric_dtype,
+    is_string_dtype,
+    is_struct_dtype,
+)
 from cudf.core._compat import PANDAS_GE_110
 
 
+def dtype_can_compare_equal_to_other(dtype):
+    # return True if values of this dtype can compare
+    # as equal to equal values of a different dtype
+    return not (
+        is_string_dtype(dtype)
+        or is_list_dtype(dtype)
+        or is_struct_dtype(dtype)
+        or is_decimal_dtype(dtype)
+        or is_interval_dtype(dtype)
+    )
+
+
 def _check_isinstance(left, right, obj):
     if not isinstance(left, obj):
         raise AssertionError(
@@ -146,6 +167,9 @@ def assert_column_equal(
                 msg1 = f"{left.dtype}"
                 msg2 = f"{right.dtype}"
                 raise_assert_detail(obj, "Dtypes are different", msg1, msg2)
+    else:
+        if left.null_count == len(left) and right.null_count == len(right):
+            return True
 
     if check_datetimelike_compat:
         if np.issubdtype(left.dtype, np.datetime64):
@@ -201,39 +225,72 @@ def assert_column_equal(
     ):
         left = left.astype(left.categories.dtype)
         right = right.astype(right.categories.dtype)
-
     columns_equal = False
-    try:
-        columns_equal = (
-            (
-                cp.all(left.isnull().values == right.isnull().values)
-                and cp.allclose(
+    if left.size == right.size == 0:
+        columns_equal = True
+    elif not (
+        (
+            not dtype_can_compare_equal_to_other(left.dtype)
+            and is_numeric_dtype(right)
+        )
+        or (
+            is_numeric_dtype(left)
+            and not dtype_can_compare_equal_to_other(right)
+        )
+    ):
+        try:
+            # nulls must be in the same places for all dtypes
+            columns_equal = cp.all(
+                left.isnull().values == right.isnull().values
+            )
+
+            if columns_equal and not check_exact and is_numeric_dtype(left):
+                # non-null values must be the same
+                columns_equal = cp.allclose(
                     left[left.isnull().unary_operator("not")].values,
                     right[right.isnull().unary_operator("not")].values,
                 )
-            )
-            if not check_exact and is_numeric_dtype(left)
-            else left.equals(right)
-        )
-    except TypeError as e:
-        if str(e) != "Categoricals can only compare with the same type":
-            raise e
-        if is_categorical_dtype(left) and is_categorical_dtype(right):
-            left = left.astype(left.categories.dtype)
-            right = right.astype(right.categories.dtype)
+                if columns_equal and (
+                    left.dtype.kind == right.dtype.kind == "f"
+                ):
+                    columns_equal = cp.all(
+                        is_nan(left).values == is_nan(right).values
+                    )
+            else:
+                columns_equal = left.equals(right)
+        except TypeError as e:
+            if str(e) != "Categoricals can only compare with the same type":
+                raise e
+            else:
+                columns_equal = False
+            if is_categorical_dtype(left) and is_categorical_dtype(right):
+                left = left.astype(left.categories.dtype)
+                right = right.astype(right.categories.dtype)
     if not columns_equal:
-        msg1 = f"{left.values_host}"
-        msg2 = f"{right.values_host}"
+        ldata = str([val for val in left.to_pandas(nullable=True)])
+        rdata = str([val for val in right.to_pandas(nullable=True)])
         try:
-            diff = left.apply_boolean_mask(left != right).size
+            diff = 0
+            for i in range(left.size):
+                if not null_safe_scalar_equals(left[i], right[i]):
+                    diff += 1
             diff = diff * 100.0 / left.size
         except BaseException:
             diff = 100.0
         raise_assert_detail(
-            obj, f"values are different ({np.round(diff, 5)} %)", msg1, msg2,
+            obj,
+            f"values are different ({np.round(diff, 5)} %)",
+            {ldata},
+            {rdata},
         )
 
 
+def null_safe_scalar_equals(left, right):
+    if left in {cudf.NA, np.nan} or right in {cudf.NA, np.nan}:
+        return left is right
+    return left == right
+
+
 def assert_index_equal(
     left,
     right,
@@ -358,7 +415,6 @@ def assert_index_equal(
                 obj=mul_obj,
             )
         return
-
     assert_column_equal(
         left._columns[0],
         right._columns[0],

diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
@@ -5,12 +5,14 @@
 import pytest
 
 import cudf
+from cudf.core.column.column import as_column, full
 from cudf.testing import (
     assert_frame_equal,
     assert_index_equal,
     assert_series_equal,
 )
 from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing.testing import assert_column_equal
 
 
 @pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]])
@@ -119,6 +121,36 @@ def test_basic_assert_series_equal(
         )
 
 
+@pytest.mark.parametrize(
+    "other",
+    [
+        as_column(["1", "2", "3"]),
+        as_column([[1], [2], [3]]),
+        as_column([{"a": 1}, {"a": 2}, {"a": 3}]),
+    ],
+)
+def test_assert_column_equal_dtype_edge_cases(other):
+    # string series should be 100% different
+    # even when the elements are the same
+    base = as_column([1, 2, 3])
+
+    # for these dtypes, the diff should always be 100% regardless of the values
+    with pytest.raises(
+        AssertionError, match=r".*values are different \(100.0 %\).*"
+    ):
+        assert_column_equal(base, other, check_dtype=False)
+
+    # the exceptions are the empty and all null cases
+    assert_column_equal(base[:0], other[:0], check_dtype=False)
+    assert_column_equal(other[:0], base[:0], check_dtype=False)
+
+    base = full(len(base), fill_value=cudf.NA, dtype=base.dtype)
+    other = full(len(other), fill_value=cudf.NA, dtype=other.dtype)
+
+    assert_column_equal(base, other, check_dtype=False)
+    assert_column_equal(other, base, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "rdtype", [["int8", "int16", "int64"], ["int64", "int16", "int8"]]
 )