Skip to content

Commit

Permalink
core: speed up writing to cache by using qmark-style SQL rather than named
Browse files Browse the repository at this point in the history

```
before:
src/cachew/tests/test_cachew.py::test_many[gc_off-1000000] [INFO    2023-09-13 03:31:29,980 cachew __init__.py:778 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote   1000000 objects to   cachew (sqlite /tmp/pytest-of-karlicos/pytest-18/test_many_gc_off_1000000_0/test_many)
test_many: initial write to cache took 4.5s
test_many: cache size is 72.904704Mb
[INFO    2023-09-13 03:31:30,273 cachew __init__.py:641 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading 1000000 objects from cachew (sqlite /tmp/pytest-of-karlicos/pytest-18/test_many_gc_off_1000000_0/test_many)
test_many: reading from cache took 2.9s
```

after:
```
src/cachew/tests/test_cachew.py::test_many[gc_off-1000000] [INFO    2023-09-14 20:50:55,287 cachew __init__.py:791 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: wrote   1000000 objects to   cachew (sqlite /tmp/pytest-of-karlicos/pytest-80/test_many_gc_off_1000000_0/test_many)
test_many: initial write to cache took 3.3s
test_many: cache size is 72.904704Mb
[INFO    2023-09-14 20:50:55,479 cachew __init__.py:654 ] cachew.tests.test_cachew:test_many.<locals>.iter_data: loading 1000000 objects from cachew (sqlite /tmp/pytest-of-karlicos/pytest-80/test_many_gc_off_1000000_0/test_many)
test_many: reading from cache took 2.8s
```
  • Loading branch information
karlicoss committed Sep 14, 2023
1 parent 5792757 commit 4537d39
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 16 deletions.
24 changes: 12 additions & 12 deletions src/cachew/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def orjson_dumps(*args, **kwargs): # type: ignore[misc]
import appdirs
import sqlalchemy
from sqlalchemy import Column, Table, event, text
from sqlalchemy.dialects import sqlite

from .logging_helper import makeLogger
from .marshall.cachew import CachewMarshall, build_schema
Expand Down Expand Up @@ -630,8 +631,8 @@ def cachew_wrapper(

def cached_items():
rows = conn.execute(table_cache.select())
for row in rows:
j = orjson_loads(row[0])
for (blob,) in rows:
j = orjson_loads(blob)
obj = marshall.load(j)
yield obj

Expand Down Expand Up @@ -730,19 +731,18 @@ def missing_keys(cached: List[str], wanted: List[str]) -> Optional[List[str]]:
# at this point we're guaranteed to have an exclusive write transaction

datas = func(*args, **kwargs)
column_names = [c.name for c in table_cache_tmp.columns]
insert_into_table_cache_tmp = table_cache_tmp.insert()
# uhh. this gives a huge speedup for inserting
# since we don't have to create intermediate dictionaries
insert_into_table_cache_tmp_raw = str(table_cache_tmp.insert().compile(dialect=sqlite.dialect(paramstyle='qmark')))
# I also tried setting paramstyle='qmark' in create_engine, but it seems to be ignored :(
# idk what benefit sqlalchemy gives at this point, seems to just complicate things

chunk: List[Any] = []

def flush() -> None:
nonlocal chunk
if len(chunk) > 0:
# TODO optimize this, we really don't need to make extra dicts here just to insert
chunk_dict = [
dict(zip(column_names, row))
for row in chunk
]
conn.execute(insert_into_table_cache_tmp, chunk_dict)
conn.exec_driver_sql(insert_into_table_cache_tmp_raw, [(c,) for c in chunk])
chunk = []

total_objects = 0
Expand All @@ -755,8 +755,8 @@ def flush() -> None:
return

dct = marshall.dump(obj)
j = orjson_dumps(dct)
chunk.append((j,))
blob = orjson_dumps(dct)
chunk.append(blob)
if len(chunk) >= chunk_by:
flush()
flush()
Expand Down
9 changes: 5 additions & 4 deletions src/cachew/tests/test_cachew.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

from .. import cachew, get_logger, NTBinder, CachewException, settings

from .utils import running_on_ci
from .utils import running_on_ci, gc_control


logger = get_logger()
Expand Down Expand Up @@ -284,9 +284,10 @@ class TE2(NamedTuple):

# you can run one specific test (e.g. to profile) by passing it as -k to pytest
# e.g. -k 'test_many[gc_off-500000]'
@pytest.mark.parametrize('count', [100_000, 500_000, 1_000_000])
def test_many(count: int, tmp_path: Path) -> None:
if count > 100_000 and running_on_ci:
@pytest.mark.parametrize('count', [99, 500_000, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_many(count: int, tmp_path: Path, gc_control) -> None:
if count > 99 and running_on_ci:
pytest.skip("test would be too slow on CI, only meant to run manually")
# should be a parametrized test perhaps
src = tmp_path / 'source'
Expand Down

0 comments on commit 4537d39

Please sign in to comment.