Add tests reading/writing/appending/updating unicode strings (#1599)
#### Reference Issues/PRs
Adds tests for PR #1559 

#### What does this implement or fix?
Adds tests for appending and updating DataFrames with unicode strings.
Also adds tests for the other similar operations (some, like
reading/writing, are folded into the append tests).
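
All of the new tests exercise the same round-trip shape. Here is a minimal sketch of the pattern; the fixture name and the Cyrillic test string ("Россия", written with explicit escapes) are taken from the tests in the diff below, while the symbol name is illustrative:

```python
import pandas as pd

from arcticdb.util.test import assert_frame_equal


def test_append_unicode_roundtrip(lmdb_version_store):
    # "Россия" spelled with explicit unicode escapes, as in the tests below
    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
    df1 = pd.DataFrame(
        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
        data={"a": ["123", uc]},
    )
    # Write, then read back: this covers the plain read/write path
    lmdb_version_store.write("sym", df1)
    assert_frame_equal(lmdb_version_store.read("sym").data, df1)

    # Append more unicode rows and verify the concatenated result
    df2 = pd.DataFrame(
        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
        data={"a": ["123", uc]},
    )
    lmdb_version_store.append("sym", df2)
    assert_frame_equal(lmdb_version_store.read("sym").data, pd.concat([df1, df2]))
```

The batch, metadata, and update variants in the diff follow the same structure, swapping in `batch_write`/`batch_append`, `batch_write_metadata`, and `update` respectively.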

#### Any other comments?

#### Checklist

<details>
  <summary>
   Checklist for code changes...
  </summary>
 
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
</details>

G-D-Petrov authored Jun 5, 2024
1 parent bf1d788 commit 59bf687
Showing 3 changed files with 159 additions and 33 deletions.
@@ -5,6 +5,7 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import itertools
import time
import sys
@@ -121,7 +122,7 @@ def test_s3_breaking_chars_exception_compat(object_version_store):
@pytest.mark.parametrize("suffix", ["", "suffix"])
def test_symbol_names_with_all_chars(object_version_store, prefix, suffix):
# Create symbol names with each character (except '\' because Azure replaces it with '/' in some cases)
names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != '\\']
names = [f"{prefix}{chr(i)}{suffix}" for i in range(256) if chr(i) != "\\"]
df = sample_dataframe()

written_symbols = set()
@@ -317,8 +318,7 @@ def test_prune_previous_versions_multiple_times(basic_store, symbol):


def test_prune_previous_versions_write_batch(basic_store):
-    """Verify that the batch write method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch write method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -348,8 +348,7 @@ def test_prune_previous_versions_write_batch(basic_store):


def test_prune_previous_versions_batch_write_metadata(basic_store):
-    """Verify that the batch write metadata method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch write metadata method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -379,8 +378,7 @@ def test_prune_previous_versions_batch_write_metadata(basic_store):


def test_prune_previous_versions_append_batch(basic_store):
-    """Verify that the batch append method correctly prunes previous versions when the corresponding option is specified.
-    """
+    """Verify that the batch append method correctly prunes previous versions when the corresponding option is specified."""
    # Given
    lib = basic_store
    lib_tool = lib.library_tool()
@@ -409,6 +407,47 @@ def test_prune_previous_versions_append_batch(basic_store):
    assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4


+def test_batch_append_unicode(basic_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
+    vit = basic_store.batch_read([symbol])[symbol]
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    basic_store.batch_append(symbols=[symbol], data_vector=[df2])
+    vit = basic_store.batch_read([symbol])[symbol]
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)
+
+
+def test_batch_write_metadata_unicode(basic_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+
+    basic_store.batch_write(symbols=[symbol], data_vector=[df1])
+    vit = basic_store.batch_read([symbol])[symbol]
+    assert_frame_equal(vit.data, df1)
+
+    meta = {"a": 1, "b": uc}
+    basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
+    vits = basic_store.batch_read_metadata([symbol])
+    metadata = vits[symbol].metadata
+    assert metadata == meta


def test_deleting_unknown_symbol(basic_store, symbol):
    df = sample_dataframe()

@@ -425,11 +464,10 @@ def test_negative_cases(basic_store, symbol):
    # To stay consistent with arctic this doesn't throw.
    basic_store.delete("does_not_exist")

    with pytest.raises(NoSuchVersionException):
        basic_store.snapshot("empty_snapshot")
    with pytest.raises(NoSuchVersionException):
-        basic_store.snapshot("empty_snapshot", versions={"non-exist-symbol":0})
+        basic_store.snapshot("empty_snapshot", versions={"non-exist-symbol": 0})
    with pytest.raises(NoDataFoundException):
        basic_store.delete_snapshot("empty_snapshot")

@@ -717,8 +755,8 @@ def test_get_info_unsorted_timestamp_index_date_range(basic_store):
        sym,
        pd.DataFrame(
            {"col": [1, 2, 3]},
-            index=[pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-03"), pd.Timestamp("2024-01-02")]
-        )
+            index=[pd.Timestamp("2024-01-01"), pd.Timestamp("2024-01-03"), pd.Timestamp("2024-01-02")],
+        ),
    )
    info = lib.get_info(sym)
    assert np.isnat(info["date_range"][0])
@@ -812,14 +850,16 @@ def test_empty_ndarr(basic_store):
    basic_store.write(sym, ndarr)
    assert_array_equal(basic_store.read(sym).data, ndarr)


# The following restrictions should be checked in the cpp layer's name_validation
-MAX_SYMBOL_SIZE=255
-UNSUPPORTED_S3_CHARS={'*', '<', '>'}
+MAX_SYMBOL_SIZE = 255
+UNSUPPORTED_S3_CHARS = {"*", "<", ">"}


# See AN-765 for why we need no_symbol_list fixture
def test_large_symbols(basic_store_no_symbol_list):
    # TODO: Make too long name on LMDB raise a friendlier UserInputException (instead of InternalException [E_INVALID_ARGUMENT])
-    with pytest.raises( (UserInputException, InternalException) ):
+    with pytest.raises((UserInputException, InternalException)):
        basic_store_no_symbol_list.write("a" * (MAX_SYMBOL_SIZE + 1), 1)

    for _ in range(5):
@@ -1435,8 +1475,8 @@ def test_batch_read_metadata_missing_keys(basic_store):
    lib_tool.remove(s2_key_to_delete)

    vits = lib.batch_read_metadata(["s2"], [1])
-    metadata = vits['s2'].metadata
-    assert metadata['s2'] == "more_metadata"
+    metadata = vits["s2"].metadata
+    assert metadata["s2"] == "more_metadata"

    with pytest.raises(StorageException):
        _ = lib.batch_read_metadata(["s1"], [None])
@@ -1665,10 +1705,10 @@ def test_find_version(lmdb_version_store_v1):
    lib.write(sym, 3)

    # Latest
-    #assert lib._find_version(sym).version == 3
+    # assert lib._find_version(sym).version == 3
    # By version number
-    #assert lib._find_version(sym, as_of=0).version == 0
-    #assert lib._find_version(sym, as_of=1).version == 1
+    # assert lib._find_version(sym, as_of=0).version == 0
+    # assert lib._find_version(sym, as_of=1).version == 1
    assert lib._find_version(sym, as_of=2) is None
    assert lib._find_version(sym, as_of=3).version == 3
    assert lib._find_version(sym, as_of=1000) is None
@@ -2231,8 +2271,9 @@ def test_batch_read_version_doesnt_exist(basic_store):
    with pytest.raises(NoDataFoundException):
        _ = basic_store.batch_read([sym1, sym2], as_ofs=[0, 1])


def test_read_batch_deleted_version_doesnt_exist(basic_store):
-    sym1 = 'mysymbol'
+    sym1 = "mysymbol"
    basic_store.write(sym1, 0)

    basic_store.delete(sym1)
@@ -2241,7 +2282,8 @@ def test_read_batch_deleted_version_doesnt_exist(basic_store):
        basic_store.read(sym1, as_of=0)

    with pytest.raises(NoSuchVersionException):
        basic_store.batch_read([sym1], as_ofs=[0])


def test_index_keys_start_end_index(basic_store, sym):
    idx = pd.date_range("2022-01-01", periods=100, freq="D")
@@ -2410,6 +2452,7 @@ def test_diff_long_stream_descriptor_mismatch(basic_store, method, num):
        if i % 20 == 4:
            assert f"FD<name=col{i}, type=TD<type=UTF" in msg


def test_wrong_df_col_order(basic_store):
    lib = basic_store

66 changes: 58 additions & 8 deletions python/tests/unit/arcticdb/version_store/test_append.py
@@ -8,7 +8,11 @@

from pandas import MultiIndex
from arcticdb.version_store import NativeVersionStore
-from arcticdb_ext.exceptions import InternalException, NormalizationException, SortingException
+from arcticdb_ext.exceptions import (
+    InternalException,
+    NormalizationException,
+    SortingException,
+)
from arcticdb_ext import set_config_int
from arcticdb.util.test import random_integers, assert_frame_equal
from arcticdb.config import set_log_level
@@ -28,6 +32,28 @@ def test_append_simple(lmdb_version_store):
    assert_frame_equal(vit.data, expected)


+def test_append_unicode(lmdb_version_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.write(symbol, df1)
+    vit = lmdb_version_store.read(symbol)
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.append(symbol, df2)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)


@pytest.mark.parametrize("empty_types", (True, False))
@pytest.mark.parametrize("dynamic_schema", (True, False))
def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
@@ -44,7 +70,11 @@ def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
    assert_frame_equal(expected, received)

    # Appending a range starting earlier or later, or with a different step size, should fail
-    for idx in [pd.RangeIndex(6, 10, 2), pd.RangeIndex(10, 14, 2), pd.RangeIndex(8, 14, 3)]:
+    for idx in [
+        pd.RangeIndex(6, 10, 2),
+        pd.RangeIndex(10, 14, 2),
+        pd.RangeIndex(8, 14, 3),
+    ]:
        with pytest.raises(NormalizationException):
            lib.append(sym, pd.DataFrame({"col": [4, 5]}, index=idx))

@@ -86,7 +116,10 @@ def test_append_string_of_different_sizes(lmdb_version_store):
    vit = lmdb_version_store.read(symbol)
    assert_frame_equal(vit.data, df1)

-    df2 = pd.DataFrame(data={"x": ["catandsomethingelse", "dogandsomethingevenlonger"]}, index=np.arange(2, 4))
+    df2 = pd.DataFrame(
+        data={"x": ["catandsomethingelse", "dogandsomethingevenlonger"]},
+        index=np.arange(2, 4),
+    )
    lmdb_version_store.append(symbol, df2)
    vit = lmdb_version_store.read(symbol)
    expected = pd.concat([df1, df2])
@@ -142,7 +175,9 @@ def _random_integers(size, dtype):
    platform_int_info = np.iinfo("int_")
    iinfo = np.iinfo(dtype)
    return np.random.randint(
-        max(iinfo.min, platform_int_info.min), min(iinfo.max, platform_int_info.max), size=size
+        max(iinfo.min, platform_int_info.min),
+        min(iinfo.max, platform_int_info.max),
+        size=size,
    ).astype(dtype)


@@ -161,7 +196,11 @@ def test_append_out_of_order_and_sort(lmdb_version_store_ignore_order, prune_pre
    num_rows = 1111
    dtidx = pd.date_range("1970-01-01", periods=num_rows)
    test = pd.DataFrame(
-        {"uint8": _random_integers(num_rows, np.uint8), "uint32": _random_integers(num_rows, np.uint32)}, index=dtidx
+        {
+            "uint8": _random_integers(num_rows, np.uint8),
+            "uint32": _random_integers(num_rows, np.uint32),
+        },
+        index=dtidx,
    )
    chunk_size = 100
    list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
@@ -185,7 +224,10 @@ def test_append_out_of_order_and_sort(lmdb_version_store_ignore_order, prune_pre
    else:
        assert len(versions) == len(list_df) + 1
        for version in sorted(versions)[:-1]:
-            assert_frame_equal(lmdb_version_store_ignore_order.read(symbol, as_of=version).data, pd.concat(list_df[0 : version+1]))
+            assert_frame_equal(
+                lmdb_version_store_ignore_order.read(symbol, as_of=version).data,
+                pd.concat(list_df[0 : version + 1]),
+            )


def test_upsert_with_delete(lmdb_version_store_big_map):
@@ -197,7 +239,11 @@ def test_upsert_with_delete(lmdb_version_store_big_map):
    num_rows = 1111
    dtidx = pd.date_range("1970-01-01", periods=num_rows)
    test = pd.DataFrame(
-        {"uint8": _random_integers(num_rows, np.uint8), "uint32": _random_integers(num_rows, np.uint32)}, index=dtidx
+        {
+            "uint8": _random_integers(num_rows, np.uint8),
+            "uint32": _random_integers(num_rows, np.uint32),
+        },
+        index=dtidx,
    )
    chunk_size = 100
    list_df = [test[i : i + chunk_size] for i in range(0, test.shape[0], chunk_size)]
@@ -576,7 +622,11 @@ def test_defragment_read_prev_versions(sym, lmdb_version_store, prune_previous_v
        update_start = end_time + pd.to_timedelta(idx, "days")
        update_end = update_start + pd.to_timedelta(10, "days")
        update_index = pd.date_range(update_start, update_end, freq="D")
-        update_df = pd.DataFrame(np.random.randn(len(update_index), len(cols)), index=update_index, columns=cols)
+        update_df = pd.DataFrame(
+            np.random.randn(len(update_index), len(cols)),
+            index=update_index,
+            columns=cols,
+        )
        lmdb_version_store.update(sym, update_df)
        next_expected_df = expected_dfs[-1].reindex(expected_dfs[-1].index.union(update_df.index))
        next_expected_df.loc[update_df.index] = update_df
41 changes: 37 additions & 4 deletions python/tests/unit/arcticdb/version_store/test_update.py
@@ -5,6 +5,7 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import pandas as pd
import numpy as np
import pytest
@@ -50,6 +51,38 @@ def test_update(version_store_factory):
    assert_frame_equal(vit.data, df)


+def test_update_unicode(lmdb_version_store):
+    symbol = "test_append_unicode"
+    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
+
+    df1 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.update(symbol, df1, upsert=True)
+    vit = lmdb_version_store.read(symbol)
+    assert_frame_equal(vit.data, df1)
+
+    df2 = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
+        data={"a": ["123", uc]},
+    )
+    lmdb_version_store.update(symbol, df2)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1, df2])
+    assert_frame_equal(vit.data, expected)
+
+    uc_new = "\u0420\u043e\u0441\u0441\u0438\u044f_new"
+    df1_new = pd.DataFrame(
+        index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
+        data={"a": ["123", uc_new]},
+    )
+    lmdb_version_store.update(symbol, df1_new)
+    vit = lmdb_version_store.read(symbol)
+    expected = pd.concat([df1_new, df2])
+    assert_frame_equal(vit.data, expected)


def test_update_long_strides(s3_version_store):
    lib = s3_version_store
    symbol = "test_update_long_strides"
@@ -457,10 +490,10 @@ def _create_product_candles_df(arr):
@pytest.mark.parametrize("update_df_sortedness", ("ASCENDING", "DESCENDING", "UNSORTED"))
@pytest.mark.parametrize("date_range_arg_provided", (True, False))
def test_update_sortedness_checks(
lmdb_version_store,
existing_df_sortedness,
update_df_sortedness,
date_range_arg_provided,
lmdb_version_store,
existing_df_sortedness,
update_df_sortedness,
date_range_arg_provided,
):
lib = lmdb_version_store
symbol = "test_update_sortedness_checks"
