Add tests reading/writing/appending/updating unicode strings (#1599)
#### Reference Issues/PRs
Adds tests for PR #1559 

#### What does this implement or fix?
Adds tests for appending and updating DataFrames with unicode strings.
Also adds tests for the other similar operations (some, like
reading/writing, are folded into the append tests).
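
All of the new tests exercise the same round-trip shape. Here is a minimal sketch of the pattern; the fixture name and the Cyrillic test string ("Россия", written with explicit escapes) are taken from the tests in the diff below, while the symbol name is illustrative:

```python
import pandas as pd

from arcticdb.util.test import assert_frame_equal


def test_append_unicode_roundtrip(lmdb_version_store):
    # "Россия" spelled with explicit unicode escapes, as in the tests below
    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
    df1 = pd.DataFrame(
        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
        data={"a": ["123", uc]},
    )
    # Write, then read back: this covers the plain read/write path
    lmdb_version_store.write("sym", df1)
    assert_frame_equal(lmdb_version_store.read("sym").data, df1)

    # Append more unicode rows and verify the concatenated result
    df2 = pd.DataFrame(
        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
        data={"a": ["123", uc]},
    )
    lmdb_version_store.append("sym", df2)
    assert_frame_equal(lmdb_version_store.read("sym").data, pd.concat([df1, df2]))
```

The batch, metadata, and update variants in the diff follow the same structure, swapping in `batch_write`/`batch_append`, `batch_write_metadata`, and `update` respectively.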

#### Any other comments?

#### Checklist

<details>
  <summary>
   Checklist for code changes...
  </summary>
 
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
</details>

G-D-Petrov authored Jun 5, 2024
1 parent bf1d788 commit 59bf687
Showing 3 changed files with 159 additions and 33 deletions.
@@ -5,6 +5,7 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import itertools
import time
import sys
@@ -121,7 +122,7 @@ def test_s3_breaking_chars_exception_compat(object_version_store):
@pytest.mark.parametrize("suffix", ["", "suffix"])
def test_symbol_names_with_all_chars(object_version_store, prefix, suffix):
# Create symbol names with each character (except '\' because Azure replaces it with '/' in some cases)
names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != '\\']
names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != "\\"]
df = sample_dataframe()

written_symbols = set()
@@ -317,8 +318,7 @@ def test_prune_previous_versions_multiple_times(basic_store, symbol):


def test_prune_previous_versions_write_batch(basic_store):
-    """Verify that the batch write method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch write method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -348,8 +348,7 @@ def test_prune_previous_versions_write_batch(basic_store):


def test_prune_previous_versions_batch_write_metadata(basic_store):
-    """Verify that the batch write metadata method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch write metadata method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -379,8 +378,7 @@ def test_prune_previous_versions_batch_write_metadata(basic_store):


def test_prune_previous_versions_append_batch(basic_store):
-    """Verify that the batch append method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch append method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -409,6 +407,47 @@ def test_prune_previous_versions_append_batch(basic_store):
    assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4


+def test_batch_append_unicode(basic_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
+    vit = basic_store.batch_read([symbol])[symbol]
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    basic_store.batch_append(symbols=[symbol], data_vector=[df2])
+    vit = basic_store.batch_read([symbol])[symbol]
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)
+
+
+def test_batch_write_metadata_unicode(basic_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+
+    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
+    vit = basic_store.batch_read([symbol])[symbol]
+    assert_frame_equal(vit.data, df1)
+
+    meta = {"a": 1, "b": uc}
+    basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
+    vits = basic_store.batch_read_metadata([symbol])
+    metadata = vits[symbol].metadata
+    assert metadata == meta


def test_deleting_unknown_symbol(basic_store, symbol):
    df = sample_dataframe()

@@ -425,11 +464,10 @@ def test_negative_cases(basic_store, symbol):
    # To stay consistent with arctic this doesn't throw.
    basic_store.delete("does_not_exist")

    with pytest.raises(NoSuchVersionException):
        basic_store.snapshot("empty_snapshot")
    with pytest.raises(NoSuchVersionException):
-        basic_store.snapshot("empty_snapshot", versions={"non-exist-symbol":0})
+        basic_store.snapshot("empty_snapshot", versions={"non-exist-symbol": 0})
    with pytest.raises(NoDataFoundException):
        basic_store.delete_snapshot("empty_snapshot")

@@ -717,8 +755,8 @@ def test_get_info_unsorted_timestamp_index_date_range(basic_store):
        sym,
        pd.DataFrame(
            {"col": [1, 2, 3]},
-            index=[pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-03"), pd.Timestamp("2024-01-02")]
-        )
+            index=[pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-03"), pd.Timestamp("2024-01-02")],
+        ),
    )
    info = lib.get_info(sym)
    assert np.isnat(info["date_range"][0])
@@ -812,14 +850,16 @@ def test_empty_ndarr(basic_store):
    basic_store.write(sym, ndarr)
    assert_array_equal(basic_store.read(sym).data, ndarr)


# The following restrictions should be checked in the cpp layer's name_validation
-MAX_SYMBOL_SIZE=255
-UNSUPPORTED_S3_CHARS={'*', '<', '>'}
+MAX_SYMBOL_SIZE = 255
+UNSUPPORTED_S3_CHARS = {"*", "<", ">"}


# See AN-765 for why we need no_symbol_list fixture
def test_large_symbols(basic_store_no_symbol_list):
    # TODO: Make too long name on LMDB raise a friendlier UserInputException (instead of InternalException [E_INVALID_ARGUMENT])
-    with pytest.raises( (UserInputException, InternalException) ):
+    with pytest.raises((UserInputException, InternalException)):
        basic_store_no_symbol_list.write("a" * (MAX_SYMBOL_SIZE + 1), 1)

    for _ in range(5):
@@ -1435,8 +1475,8 @@ def test_batch_read_metadata_missing_keys(basic_store):
    lib_tool.remove(s2_key_to_delete)

    vits = lib.batch_read_metadata(["s2"], [1])
-    metadata = vits['s2'].metadata
-    assert metadata['s2'] == "more_metadata"
+    metadata = vits["s2"].metadata
+    assert metadata["s2"] == "more_metadata"

    with pytest.raises(StorageException):
        _ = lib.batch_read_metadata(["s1"], [None])
@@ -1665,10 +1705,10 @@ def test_find_version(lmdb_version_store_v1):
    lib.write(sym, 3)

    # Latest
-    #assert lib._find_version(sym).version == 3
+    # assert lib._find_version(sym).version == 3
    # By version number
-    #assert lib._find_version(sym, as_of=0).version == 0
-    #assert lib._find_version(sym, as_of=1).version == 1
+    # assert lib._find_version(sym, as_of=0).version == 0
+    # assert lib._find_version(sym, as_of=1).version == 1
    assert lib._find_version(sym, as_of=2) is None
    assert lib._find_version(sym, as_of=3).version == 3
    assert lib._find_version(sym, as_of=1000) is None
@@ -2231,8 +2271,9 @@ def test_batch_read_version_doesnt_exist(basic_store):
    with pytest.raises(NoDataFoundException):
        _ = basic_store.batch_read([sym1, sym2], as_ofs=[0, 1])


def test_read_batch_deleted_version_doesnt_exist(basic_store):
-    sym1 = 'mysymbol'
+    sym1 = "mysymbol"
    basic_store.write(sym1, 0)

    basic_store.delete(sym1)
@@ -2241,7 +2282,8 @@ def test_read_batch_deleted_version_doesnt_exist(basic_store):
        basic_store.read(sym1, as_of=0)

    with pytest.raises(NoSuchVersionException):
        basic_store.batch_read([sym1], as_ofs=[0])


def test_index_keys_start_end_index(basic_store, sym):
    idx = pd.date_range("2022-01-01", periods=100, freq="D")
@@ -2410,6 +2452,7 @@ def test_diff_long_stream_descriptor_mismatch(basic_store, method, num):
        if i % 20 == 4:
            assert f"FD<name=col{i}, type=TD<type=UTF" in msg


def test_wrong_df_col_order(basic_store):
    lib = basic_store

66 changes: 58 additions & 8 deletions python/tests/unit/arcticdb/version_store/test_append.py
@@ -8,7 +8,11 @@

from pandas import MultiIndex
from arcticdb.version_store import NativeVersionStore
-from arcticdb_ext.exceptions import InternalException, NormalizationException, SortingException
+from arcticdb_ext.exceptions import (
+    InternalException,
+    NormalizationException,
+    SortingException,
+)
from arcticdb_ext import set_config_int
from arcticdb.util.test import random_integers, assert_frame_equal
from arcticdb.config import set_log_level
@@ -28,6 +32,28 @@ def test_append_simple(lmdb_version_store):
    assert_frame_equal(vit.data, expected)


+def test_append_unicode(lmdb_version_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.write(symbol, df1)
+    vit = lmdb_version_store.read(symbol)
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.append(symbol, df2)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)


@pytest.mark.parametrize("empty_types", (True, False))
@pytest.mark.parametrize("dynamic_schema", (True, False))
def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
@@ -44,7 +70,11 @@ def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
    assert_frame_equal(expected, received)

    # Appending a range starting earlier or later, or with a different step size, should fail
-    for idx in [pd.RangeIndex(6, 10, 2), pd.RangeIndex(10, 14, 2), pd.RangeIndex(8, 14, 3)]:
+    for idx in [
+        pd.RangeIndex(6, 10, 2),
+        pd.RangeIndex(10, 14, 2),
+        pd.RangeIndex(8, 14, 3),
+    ]:
        with pytest.raises(NormalizationException):
            lib.append(sym, pd.DataFrame({"col": [4, 5]}, index=idx))

@@ -86,7 +116,10 @@ def test_append_string_of_different_sizes(lmdb_version_store):
    vit = lmdb_version_store.read(symbol)
    assert_frame_equal(vit.data, df1)

-    df2 = pd.DataFrame(data={"x": ["catandsomethingelse", "dogandsomethingevenlonger"]}, index=np.arange(2, 4))
+    df2 = pd.DataFrame(
+        data={"x": ["catandsomethingelse", "dogandsomethingevenlonger"]},
+        index=np.arange(2, 4),
+    )
    lmdb_version_store.append(symbol, df2)
    vit = lmdb_version_store.read(symbol)
    expected = pd.concat([df1, df2])
@@ -142,7 +175,9 @@ def _random_integers(size, dtype):
    platform_int_info = np.iinfo("int_")
    iinfo = np.iinfo(dtype)
    return np.random.randint(
-        max(iinfo.min, platform_int_info.min), min(iinfo.max, platform_int_info.max), size=size
+        max(iinfo.min, platform_int_info.min),
+        min(iinfo.max, platform_int_info.max),
+        size=size,
    ).astype(dtype)


@@ -161,7 +196,11 @@ def test_append_out_of_order_and_sort(lmdb_version_store_ignore_order, prune_pre
    num_rows = 1111
    dtidx = pd.date_range("1970-01-01", periods=num_rows)
    test = pd.DataFrame(
-        {"uint8": _random_integers(num_rows, np.uint8), "uint32": _random_integers(num_rows, np.uint32)}, index=dtidx
+        {
+            "uint8": _random_integers(num_rows, np.uint8),
+            "uint32": _random_integers(num_rows, np.uint32),
+        },
+        index=dtidx,
    )
    chunk_size = 100
    list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
@@ -185,7 +224,10 @@ def test_append_out_of_order_and_sort(lmdb_version_store_ignore_order, prune_pre
    else:
        assert len(versions) == len(list_df) + 1
        for version in sorted(versions)[:-1]:
-            assert_frame_equal(lmdb_version_store_ignore_order.read(symbol, as_of=version).data, pd.concat(list_df[0 : version+1]))
+            assert_frame_equal(
+                lmdb_version_store_ignore_order.read(symbol, as_of=version).data,
+                pd.concat(list_df[0 : version + 1]),
+            )


def test_upsert_with_delete(lmdb_version_store_big_map):
@@ -197,7 +239,11 @@ def test_upsert_with_delete(lmdb_version_store_big_map):
    num_rows = 1111
    dtidx = pd.date_range("1970-01-01", periods=num_rows)
    test = pd.DataFrame(
-        {"uint8": _random_integers(num_rows, np.uint8), "uint32": _random_integers(num_rows, np.uint32)}, index=dtidx
+        {
+            "uint8": _random_integers(num_rows, np.uint8),
+            "uint32": _random_integers(num_rows, np.uint32),
+        },
+        index=dtidx,
    )
    chunk_size = 100
    list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
@@ -576,7 +622,11 @@ def test_defragment_read_prev_versions(sym, lmdb_version_store, prune_previous_v
        update_start = end_time + pd.to_timedelta(idx, "days")
        update_end = update_start + pd.to_timedelta(10, "days")
        update_index = pd.date_range(update_start, update_end, freq="D")
-        update_df = pd.DataFrame(np.random.randn(len(update_index), len(cols)), index=update_index, columns=cols)
+        update_df = pd.DataFrame(
+            np.random.randn(len(update_index), len(cols)),
+            index=update_index,
+            columns=cols,
+        )
        lmdb_version_store.update(sym, update_df)
        next_expected_df = expected_dfs[-1].reindex(expected_dfs[-1].index.union(update_df.index))
        next_expected_df.loc[update_df.index] = update_df
41 changes: 37 additions & 4 deletions python/tests/unit/arcticdb/version_store/test_update.py
@@ -5,6 +5,7 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import pandas as pd
import numpy as np
import pytest
@@ -50,6 +51,38 @@ def test_update(version_store_factory):
    assert_frame_equal(vit.data, df)


+def test_update_unicode(lmdb_version_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.update(symbol, df1, upsert=True)
+    vit = lmdb_version_store.read(symbol)
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.update(symbol, df2)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)
+
+    uc_new = "\u0420\u043e\u0441\u0441\u0438\u044f_new"
+    df1_new = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc_new]},
+    )
+    lmdb_version_store.update(symbol, df1_new)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1_new, df2])
+    assert_frame_equal(vit.data, expected)


def test_update_long_strides(s3_version_store):
    lib = s3_version_store
    symbol = "test_update_long_strides"
@@ -457,10 +490,10 @@ def _create_product_candles_df(arr):
@pytest.mark.parametrize("update_df_sortedness", ("ASCENDING", "DESCENDING", "UNSORTED"))
@pytest.mark.parametrize("date_range_arg_provided", (True, False))
def test_update_sortedness_checks(
lmdb_version_store,
existing_df_sortedness,
update_df_sortedness,
date_range_arg_provided,
lmdb_version_store,
existing_df_sortedness,
update_df_sortedness,
date_range_arg_provided,
):
lib = lmdb_version_store
symbol = "test_update_sortedness_checks"
