Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test_initcap failed with seed 14 #9247

Open
Tracked by #9241
thirtiseven opened this issue Sep 18, 2023 · 10 comments
Open
Tracked by #9241

test_initcap failed with seed 14 #9247

thirtiseven opened this issue Sep 18, 2023 · 10 comments
Assignees
Labels
cudf_dependency An issue or PR with this label depends on a new feature in cudf

Comments

@thirtiseven
Copy link
Collaborator

thirtiseven commented Sep 18, 2023

Changing seed to some other values like 14 will fail test_initcap:

@incompat
def test_initcap():
    # Because we don't use the same unicode version we need to limit
    # the charicter set to something more reasonable
    # upper and lower should cover the corner cases, this is mostly to
    # see if there are issues with spaces
    gen = mk_str_gen('([aAbB1357ȺéŸ_@%-]{0,15}[ \r\n\t]{1,2}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
            lambda spark: unary_op_df(spark, gen, seed=14).select(
                f.initcap(f.col('a'))))

log:

________________________________________________________________________ test_initcap _________________________________________________________________________
[gw0] linux -- Python 3.8.3 /home/haoyangl/.pyenv/versions/3.8.3/bin/python

    @incompat
    # @pytest.mark.parametrize('seed', [i for i in range(30)], ids=idfn)
    def test_initcap():
        # Because we don't use the same unicode version we need to limit
        # the charicter set to something more reasonable
        # upper and lower should cover the corner cases, this is mostly to
        # see if there are issues with spaces
        gen = mk_str_gen('([aAbB1357ȺéŸ_@%-]{0,15}[ \r\n\t]{1,2}){1,5}')
>       assert_gpu_and_cpu_are_equal_collect(
                lambda spark: unary_op_df(spark, gen, seed=14).select(
                    f.initcap(f.col('a'))))

../../src/main/python/string_test.py:661:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../src/main/python/asserts.py:566: in assert_gpu_and_cpu_are_equal_collect
    _assert_gpu_and_cpu_are_equal(func, 'COLLECT', conf=conf, is_cpu_first=is_cpu_first)
../../src/main/python/asserts.py:497: in _assert_gpu_and_cpu_are_equal
    assert_equal(from_cpu, from_gpu)
../../src/main/python/asserts.py:107: in assert_equal
    _assert_equal(cpu, gpu, float_check=get_float_check(), path=[])
../../src/main/python/asserts.py:43: in _assert_equal
    _assert_equal(cpu[index], gpu[index], float_check, path + [index])
../../src/main/python/asserts.py:36: in _assert_equal
    _assert_equal(cpu[field], gpu[field], float_check, path + [field])
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

cpu = 'ß«¾îí\x87\x04av~', gpu = 'SS«¾îí\x87\x04av~', float_check = <function get_float_check.<locals>.<lambda> at 0x7f481df77ee0>, path = [80, 'initcap(a)']

    def _assert_equal(cpu, gpu, float_check, path):
        t = type(cpu)
        if (t is Row):
            assert len(cpu) == len(gpu), "CPU and GPU row have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu))
            if hasattr(cpu, "__fields__") and hasattr(gpu, "__fields__"):
                assert cpu.__fields__ == gpu.__fields__, "CPU and GPU row have different fields at {} CPU: {} GPU: {}".format(path, cpu.__fields__, gpu.__fields__)
                for field in cpu.__fields__:
                    _assert_equal(cpu[field], gpu[field], float_check, path + [field])
            else:
                for index in range(len(cpu)):
                    _assert_equal(cpu[index], gpu[index], float_check, path + [index])
        elif (t is list):
            assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu))
            for index in range(len(cpu)):
                _assert_equal(cpu[index], gpu[index], float_check, path + [index])
        elif (t is tuple):
            assert len(cpu) == len(gpu), "CPU and GPU list have different lengths at {} CPU: {} GPU: {}".format(path, len(cpu), len(gpu))
            for index in range(len(cpu)):
                _assert_equal(cpu[index], gpu[index], float_check, path + [index])
        elif (t is pytypes.GeneratorType):
            index = 0
            # generator has no zip :( so we have to do this the hard way
            done = False
            while not done:
                sub_cpu = None
                sub_gpu = None
                try:
                    sub_cpu = next(cpu)
                except StopIteration:
                    done = True

                try:
                    sub_gpu = next(gpu)
                except StopIteration:
                    done = True

                if done:
                    assert sub_cpu == sub_gpu and sub_cpu == None, "CPU and GPU generators have different lengths at {}".format(path)
                else:
                    _assert_equal(sub_cpu, sub_gpu, float_check, path + [index])

                index = index + 1
        elif (t is dict):
            # The order of key/values is not guaranteed in python dicts, nor are they guaranteed by Spark
            # so sort the items to do our best with ignoring the order of dicts
            cpu_items = list(cpu.items()).sort(key=_RowCmp)
            gpu_items = list(gpu.items()).sort(key=_RowCmp)
            _assert_equal(cpu_items, gpu_items, float_check, path + ["map"])
        elif (t is int):
            assert cpu == gpu, "GPU and CPU int values are different at {}".format(path)
        elif (t is float):
            if (math.isnan(cpu)):
                assert math.isnan(gpu), "GPU and CPU float values are different at {}".format(path)
            else:
                assert float_check(cpu, gpu), "GPU and CPU float values are different {}".format(path)
        elif isinstance(cpu, str):
>           assert cpu == gpu, "GPU and CPU string values are different at {}".format(path)
E           AssertionError: GPU and CPU string values are different at [80, 'initcap(a)']

../../src/main/python/asserts.py:85: AssertionError
-------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------
### CPU RUN ###
### GPU RUN ###
### COLLECT: GPU TOOK 0.5010926723480225 CPU TOOK 3.5713346004486084 ###
--- CPU OUTPUT
+++ GPU OUTPUT
@@ -78,7 +78,7 @@
 Row(initcap(a)='@13_-%7b@b%5é_a\r Bb-é3béé375a17@\t B3b_@é-@@b3béⱥÿ\r 5773ÿé3ⱥb%5ⱥ5%a\r\t75ⱥbaa3bb5@é-é@ \r')
 Row(initcap(a)='_aaa-_@ⱥ5@-5a5ÿ\r%é3é1ⱥ_aba77_%a\t 55@15b7ab%1ÿb_3\r\r_777%-33-ÿé_7%b\t A_-5a73ÿ755a5éb \n')
 Row(initcap(a)='A3-bÿ7é3ÿb7a-bÿ\n\rⱥÿ-@-é-77ⱥÿ-@5b\ra7-ba77%b-ⱥ7@51\n\r51@a5bab-b3b5_b Ⱥ@%aa@-ab_a11é1 \n')
-Row(initcap(a)='ß«¾îí\x87\x04av~')
+Row(initcap(a)='SS«¾îí\x87\x04av~')
 Row(initcap(a)='Éé%éaa7-ba%b1é3 \t553-a1ⱥ_bbbb@1%\n55b%aba@ÿ55ÿⱥÿÿ\n7-3b-3ÿééⱥ@7-_\r\nab_-a73b7é1-3ⱥ_\t ')
 Row(initcap(a)='@ÿⱥéa_b15_3éⱥ__\r Éa@53a@%1aaa1bb \rⱥbaaÿ-é_71b5-_5\r\n1@a57bab7b3ébé3\n\né1%_1ⱥⱥ5bÿ7a%-5\r\n')
 Row(initcap(a)=None)
@revans2
Copy link
Collaborator

revans2 commented Sep 19, 2023

This looks like we found a character that we do not translate to upper case properly. I'll try to do some more digging here.

@revans2 revans2 self-assigned this Sep 19, 2023
@revans2
Copy link
Collaborator

revans2 commented Sep 20, 2023

So the characters in the output do not match the characters listed in the regexp. I don't think sre_yield is doing what we want/expect here and going off of bytes instead of characters. Need to dig in a little bit more to understand if this is expected or not.

@revans2
Copy link
Collaborator

revans2 commented Sep 20, 2023

Okay, now I am even more confused. I added in upper and lower in addition to initcap just to see what would happen, and the CPU is doing something rather odd.

-Row(a='ß«¾îí\x87\x04Av~', IC='ß«¾îí\x87\x04av~', U='SS«¾ÎÍ\x87\x04AV~', L='ß«¾îí\x87\x04av~')
+Row(a='ß«¾îí\x87\x04Av~', IC='SS«¾îí\x87\x04av~', U='SS«¾ÎÍ\x87\x04AV~', L='ß«¾îí\x87\x04av~')

We are consistent and make ß upper case as the first character, but the CPU keeps it lower case and I really want to understand why...

@revans2
Copy link
Collaborator

revans2 commented Sep 20, 2023

OK on further digging this is a real bug in our code. title case and upper case are not the same thing. We are converting characters to upper case and lower case using the cudf strings::capitalize function. But it converts the values to upper case, which at least in the case of ß is not the same as title case. We are likely going to need some help from cudf to make this work properly.

@revans2 revans2 removed their assignment Sep 20, 2023
@revans2 revans2 added the cudf_dependency An issue or PR with this label depends on a new feature in cudf label Sep 20, 2023
@revans2
Copy link
Collaborator

revans2 commented Sep 20, 2023

@revans2
Copy link
Collaborator

revans2 commented Sep 20, 2023

rapidsai/cudf#14144

@revans2 revans2 self-assigned this Sep 21, 2023
@revans2
Copy link
Collaborator

revans2 commented Sep 21, 2023

CUDF agreed to fix the issue so I will keep this assigned to me to verify the fix.

@jlowe
Copy link
Member

jlowe commented Nov 30, 2023

Saw another instance of this with a different seed:

[2023-11-29T23:15:09.589Z] FAILED ../../../../integration_tests/src/main/python/string_test.py::test_initcap[DATAGEN_SEED=1701294231, INCOMPAT] - AssertionError: GPU and CPU string values are different at [1854, 'initcap(a)']

@firestarman
Copy link
Collaborator

moved to 24.02

@revans2
Copy link
Collaborator

revans2 commented Jan 22, 2024

Just for reference there are a number of characters that do not match between the CPU and the GPU. It is not too hard to do an exhaustive check in scala.

(0 to 65536).map(_.toChar.toString).toDF("a").repartition(1).selectExpr("a", "initCap(a) as b")

This showed 265 characters where we got the wrong answer. The code point values for these are.

(223, 304, 329, 452, 454, 455, 457, 458, 460, 496, 497, 499, 604, 609, 618, 620, 642, 647, 669, 670, 912, 944, 1011, 1012, 1321, 1323, 1325, 1327, 1415, 4304, 4305, 4306, 4307, 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4320, 4321, 4322, 4323, 4324, 4325, 4326, 4327, 4328, 4329, 4330, 4331, 4332, 4333, 4334, 4335, 4336, 4337, 4338, 4339, 4340, 4341, 4342, 4343, 4344, 4345, 4346, 4349, 4350, 4351, 5112, 5113, 5114, 5115, 5116, 5117, 7296, 7297, 7298, 7299, 7300, 7301, 7302, 7303, 7304, 7566, 7830, 7831, 7832, 7833, 7834, 7838, 8016, 8018, 8020, 8022, 8064, 8065, 8066, 8067, 8068, 8069, 8070, 8071, 8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8096, 8097, 8098, 8099, 8100, 8101, 8102, 8103, 8114, 8115, 8116, 8118, 8119, 8130, 8131, 8132, 8134, 8135, 8146, 8147, 8150, 8151, 8162, 8163, 8164, 8166, 8167, 8178, 8179, 8180, 8182, 8183, 8486, 8490, 8491, 42649, 42651, 42900, 42903, 42905, 42907, 42909, 42911, 42933, 42935, 42937, 42939, 42941, 42943, 42947, 43859, 43888, 43889, 43890, 43891, 43892, 43893, 43894, 43895, 43896, 43897, 43898, 43899, 43900, 43901, 43902, 43903, 43904, 43905, 43906, 43907, 43908, 43909, 43910, 43911, 43912, 43913, 43914, 43915, 43916, 43917, 43918, 43919, 43920, 43921, 43922, 43923, 43924, 43925, 43926, 43927, 43928, 43929, 43930, 43931, 43932, 43933, 43934, 43935, 43936, 43937, 43938, 43939, 43940, 43941, 43942, 43943, 43944, 43945, 43946, 43947, 43948, 43949, 43950, 43951, 43952, 43953, 43954, 43955, 43956, 43957, 43958, 43959, 43960, 43961, 43962, 43963, 43964, 43965, 43966, 43967, 64256, 64257, 64258, 64259, 64260, 64261, 64262, 64265, 64266, 64267, 64268, 64269, 64275, 64276, 64277, 64278, 64279)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
cudf_dependency An issue or PR with this label depends on a new feature in cudf
Projects
None yet
Development

No branches or pull requests

4 participants