From 0dbbaa899e31a2ce9c30684ae99b4ede206189af Mon Sep 17 00:00:00 2001 From: fjetter Date: Tue, 8 Aug 2023 12:54:12 +0200 Subject: [PATCH 1/5] Remove dumps_task --- distributed/core.py | 18 --- distributed/recreate_tasks.py | 15 +-- distributed/scheduler.py | 32 +++-- distributed/tests/test_scheduler.py | 28 +---- distributed/tests/test_worker.py | 5 +- .../tests/test_worker_state_machine.py | 23 ++-- distributed/utils_test.py | 7 -- distributed/worker.py | 119 ++---------------- distributed/worker_state_machine.py | 58 ++------- 9 files changed, 59 insertions(+), 246 deletions(-) diff --git a/distributed/core.py b/distributed/core.py index b17177e280..df6d3cb8e9 100644 --- a/distributed/core.py +++ b/distributed/core.py @@ -26,7 +26,6 @@ from dask.utils import parse_timedelta from distributed import profile, protocol -from distributed.collections import LRU from distributed.comm import ( Comm, CommClosedError, @@ -40,7 +39,6 @@ from distributed.counter import Counter from distributed.diskutils import WorkDir, WorkSpace from distributed.metrics import context_meter, time -from distributed.protocol import pickle from distributed.system_monitor import SystemMonitor from distributed.utils import ( NoOpAwaitable, @@ -64,21 +62,6 @@ Coro = Coroutine[Any, Any, T] -cache_loads: LRU[bytes, Callable[..., Any]] = LRU(maxsize=100) - - -def loads_function(bytes_object): - """Load a function from bytes, cache bytes""" - if len(bytes_object) < 100000: - try: - result = cache_loads[bytes_object] - except KeyError: - result = pickle.loads(bytes_object) - cache_loads[bytes_object] = result - return result - return pickle.loads(bytes_object) - - class Status(Enum): """ This Enum contains the various states a cluster, worker, scheduler and nanny can be @@ -519,7 +502,6 @@ def func(data): if load: try: import_file(out_filename) - cache_loads.data.clear() except Exception as e: logger.exception(e) raise e diff --git a/distributed/recreate_tasks.py b/distributed/recreate_tasks.py index a8351dad70..d76aaaf566 100644 --- a/distributed/recreate_tasks.py +++ b/distributed/recreate_tasks.py @@ -5,9 +5,9 @@ from dask.utils import stringify from distributed.client import futures_of, wait +from distributed.protocol.serialize import ToPickle from distributed.utils import sync from distributed.utils_comm import pack_data -from distributed.worker import _deserialize logger = logging.getLogger(__name__) @@ -42,7 +42,10 @@ def get_error_cause(self, *args, keys=(), **kwargs): def get_runspec(self, *args, key=None, **kwargs): key = self._process_key(key) ts = self.scheduler.tasks.get(key) - return {"task": ts.run_spec, "deps": [dts.key for dts in ts.dependencies]} + return { + "task": ToPickle(ts.run_spec), + "deps": [dts.key for dts in ts.dependencies], + } class ReplayTaskClient: @@ -83,13 +86,7 @@ async def _get_raw_components_from_future(self, future): await wait(future) key = future.key spec = await self.scheduler.get_runspec(key=key) - deps, task = spec["deps"], spec["task"] - if isinstance(task, dict): - function, args, kwargs = _deserialize(**task) - return (function, args, kwargs, deps) - else: - function, args, kwargs = _deserialize(task=task) - return (function, args, kwargs, deps) + return (*spec["task"], spec["deps"]) async def _prepare_raw_components(self, raw_components): """ diff --git a/distributed/scheduler.py b/distributed/scheduler.py index 9e1b16fd76..36d771fc81 100644 --- a/distributed/scheduler.py +++ b/distributed/scheduler.py @@ -126,7 +126,7 @@ ) from distributed.utils_perf import disable_gc_diagnosis, enable_gc_diagnosis from distributed.variable import VariableExtension -from distributed.worker import dumps_task +from distributed.worker import _normalize_task if TYPE_CHECKING: # TODO import from typing (requires Python >=3.10) @@ -156,6 +156,8 @@ # (recommendations, client messages, worker messages) RecsMsgs: TypeAlias = tuple[Recs, Msgs, Msgs] +T_runspec: TypeAlias = tuple[Callable, tuple, dict[str, Any]] + logger = logging.getLogger(__name__) LOG_PDB = dask.config.get("distributed.admin.pdb-on-err") DEFAULT_DATA_SIZE = parse_bytes( @@ -1176,7 +1178,7 @@ class TaskState: #: "pure data" (such as, for example, a piece of data loaded in the scheduler using #: :meth:`Client.scatter`). A "pure data" task cannot be computed again if its #: value is lost. - run_spec: object + run_spec: T_runspec | None #: The priority provides each task with a relative ranking which is used to break #: ties when many tasks are being considered for execution. @@ -1375,7 +1377,7 @@ class TaskState: def __init__( self, key: str, - run_spec: object, + run_spec: T_runspec | None, state: TaskStateState, ): self.key = key @@ -1787,7 +1789,7 @@ def __pdict__(self) -> dict[str, Any]: def new_task( self, key: str, - spec: object, + spec: T_runspec | None, state: TaskStateState, computation: Computation | None = None, ) -> TaskState: @@ -3343,10 +3345,7 @@ def _task_to_msg(self, ts: TaskState, duration: float = -1) -> dict[str, Any]: dts.key: [ws.address for ws in dts.who_has] for dts in ts.dependencies }, "nbytes": {dts.key: dts.nbytes for dts in ts.dependencies}, - "run_spec": None, - "function": None, - "args": None, - "kwargs": None, + "run_spec": ToPickle(ts.run_spec), "resource_restrictions": ts.resource_restrictions, "actor": ts.actor, "annotations": ts.annotations, @@ -3355,11 +3354,6 @@ def _task_to_msg(self, ts: TaskState, duration: float = -1) -> dict[str, Any]: if self.validate: assert all(msg["who_has"].values()) - if isinstance(ts.run_spec, dict): - msg.update(ts.run_spec) - else: - msg["run_spec"] = ts.run_spec - return msg @@ -4606,7 +4600,11 @@ async def update_graph( self.digest_metric("update-graph-duration", end - start) def _generate_taskstates( - self, keys: set[str], dsk: dict, dependencies: dict, computation: Computation + self, + keys: set[str], + dsk: dict[str, T_runspec], + dependencies: dict[str, set[str]], + computation: Computation, ) -> tuple: # Get or create task states runnable = [] @@ -8479,8 +8477,8 @@ def transition( def _materialize_graph( - graph: HighLevelGraph, global_annotations: dict -) -> tuple[dict, dict, dict]: + graph: HighLevelGraph, global_annotations: dict[str, Any] +) -> tuple[dict[str, T_runspec], dict[str, set[str]], dict[str, Any]]: dsk = dask.utils.ensure_dict(graph) annotations_by_type: defaultdict[str, dict[str, Any]] = defaultdict(dict) for annotations_type, value in global_annotations.items(): @@ -8536,6 +8534,6 @@ def _materialize_graph( for k in list(dsk): if dsk[k] is k: del dsk[k] - dsk = valmap(dumps_task, dsk) + dsk = valmap(_normalize_task, dsk) return dsk, dependencies, annotations_by_type diff --git a/distributed/tests/test_scheduler.py b/distributed/tests/test_scheduler.py index af476c0915..04cd6cc798 100644 --- a/distributed/tests/test_scheduler.py +++ b/distributed/tests/test_scheduler.py @@ -24,7 +24,7 @@ import dask from dask import delayed from dask.highlevelgraph import HighLevelGraph, MaterializedLayer -from dask.utils import apply, parse_timedelta, stringify, tmpfile, typename +from dask.utils import parse_timedelta, stringify, tmpfile, typename from distributed import ( CancelledError, @@ -74,7 +74,7 @@ varying, wait_for_state, ) -from distributed.worker import dumps_function, dumps_task, get_worker, secede +from distributed.worker import dumps_function, get_worker, secede pytestmark = pytest.mark.ci1 @@ -953,24 +953,6 @@ def test_dumps_function(): assert a != c -def test_dumps_task(): - d = dumps_task((inc, 1)) - assert set(d) == {"function", "args"} - - def f(x, y=2): - return x + y - - d = dumps_task((apply, f, (1,), {"y": 10})) - assert cloudpickle.loads(d["function"])(1, 2) == 3 - assert cloudpickle.loads(d["args"]) == (1,) - assert cloudpickle.loads(d["kwargs"]) == {"y": 10} - - d = dumps_task((apply, f, (1,))) - assert cloudpickle.loads(d["function"])(1, 2) == 3 - assert cloudpickle.loads(d["args"]) == (1,) - assert set(d) == {"function", "args"} - - @pytest.mark.parametrize("worker_saturation", [1.0, float("inf")]) @gen_cluster(client=True) async def test_ready_remove_worker(c, s, a, b, worker_saturation): @@ -1357,9 +1339,9 @@ async def test_update_graph_culls(s, a, b): layers={ "foo": MaterializedLayer( { - "x": dumps_task((inc, 1)), - "y": dumps_task((inc, "x")), - "z": dumps_task((inc, 2)), + "x": (inc, 1), + "y": (inc, "x"), + "z": (inc, 2), } ) }, diff --git a/distributed/tests/test_worker.py b/distributed/tests/test_worker.py index 29b789618c..6200878da1 100644 --- a/distributed/tests/test_worker.py +++ b/distributed/tests/test_worker.py @@ -95,7 +95,6 @@ ExecuteFailureEvent, ExecuteSuccessEvent, RemoveReplicasEvent, - SerializedTask, StealRequestEvent, ) @@ -2090,7 +2089,7 @@ async def test_stimulus_story(c, s, a): assert isinstance(story[0], ComputeTaskEvent) assert story[0].key == "f1" - assert story[0].run_spec == SerializedTask(task=None) # Not logged + assert story[0].run_spec is None # Not logged assert isinstance(story[1], ExecuteSuccessEvent) assert story[1].key == "f1" @@ -2100,7 +2099,7 @@ async def test_stimulus_story(c, s, a): assert isinstance(story[2], ComputeTaskEvent) assert story[2].key == "f2" assert story[2].who_has == {"f1": (a.address,)} - assert story[2].run_spec == SerializedTask(task=None) # Not logged + assert story[2].run_spec is None # Not logged assert story[2].handled >= story[1].handled assert isinstance(story[3], ExecuteFailureEvent) diff --git a/distributed/tests/test_worker_state_machine.py b/distributed/tests/test_worker_state_machine.py index b06ab6548e..c26ead57e6 100644 --- a/distributed/tests/test_worker_state_machine.py +++ b/distributed/tests/test_worker_state_machine.py @@ -57,7 +57,6 @@ RetryBusyWorkerEvent, RetryBusyWorkerLater, SecedeEvent, - SerializedTask, StateMachineEvent, TaskErredMsg, TaskState, @@ -375,28 +374,27 @@ def test_event_to_dict_without_annotations(): def test_computetask_to_dict(): """The potentially very large ComputeTaskEvent.run_spec is not stored in the log""" + + def f(arg): + pass + ev = ComputeTaskEvent( key="x", who_has={"y": ["w1"]}, nbytes={"y": 123}, priority=(0,), duration=123.45, - run_spec=None, + run_spec=(f, "arg", {}), resource_restrictions={}, actor=False, annotations={}, span_id=None, stimulus_id="test", - function=b"blob", - args=b"blob", - kwargs=None, - run_id=5, ) - assert ev.run_spec == SerializedTask(function=b"blob", args=b"blob") + assert ev.run_spec is not None ev2 = ev.to_loggable(handled=11.22) assert ev2.handled == 11.22 - assert ev2.run_spec == SerializedTask(task=None) - assert ev.run_spec == SerializedTask(function=b"blob", args=b"blob") + assert ev2.run_spec is None d = recursive_to_dict(ev2) assert d == { "cls": "ComputeTaskEvent", @@ -404,7 +402,7 @@ def test_computetask_to_dict(): "who_has": {"y": ["w1"]}, "nbytes": {"y": 123}, "priority": [0], - "run_spec": [None, None, None, None], + "run_spec": [None, None, None], "duration": 123.45, "resource_restrictions": {}, "actor": False, @@ -412,14 +410,11 @@ def test_computetask_to_dict(): "span_id": None, "stimulus_id": "test", "handled": 11.22, - "function": None, - "args": None, - "kwargs": None, "run_id": 5, } ev3 = StateMachineEvent.from_dict(d) assert isinstance(ev3, ComputeTaskEvent) - assert ev3.run_spec == SerializedTask(task=None) + assert ev3.run_spec is None assert ev3.priority == (0,) # List is automatically converted back to tuple diff --git a/distributed/utils_test.py b/distributed/utils_test.py index ee2300c7aa..e7bdb70b0b 100644 --- a/distributed/utils_test.py +++ b/distributed/utils_test.py @@ -2303,13 +2303,6 @@ async def execute(self, key: str, *, stimulus_id: str) -> StateMachineEvent: self.in_execute_exit.set() await self.block_execute_exit.wait() - async def _maybe_deserialize_task( - self, ts: WorkerTaskState - ) -> tuple[Callable, tuple, dict[str, Any]]: - self.in_deserialize_task.set() - await self.block_deserialize_task.wait() - return await super()._maybe_deserialize_task(ts) - @contextmanager def freeze_data_fetching(w: Worker, *, jump_start: bool = False) -> Iterator[None]: diff --git a/distributed/worker.py b/distributed/worker.py index 27dbc5166f..3af7117909 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -66,7 +66,6 @@ from distributed.comm import Comm, connect, get_address_host, parse_address from distributed.comm import resolve_address as comm_resolve_address from distributed.comm.addressing import address_from_user_args -from distributed.comm.utils import OFFLOAD_THRESHOLD from distributed.compatibility import PeriodicCallback from distributed.core import ( ConnectionPool, @@ -75,7 +74,6 @@ coerce_to_address, context_meter_to_server_digest, error_message, - loads_function, pingpong, ) from distributed.core import rpc as RPCType @@ -125,7 +123,6 @@ WorkerMemoryManager, ) from distributed.worker_state_machine import ( - NO_VALUE, AcquireReplicasEvent, BaseWorker, CancelComputeEvent, @@ -162,6 +159,7 @@ from distributed.client import Client from distributed.diagnostics.plugin import WorkerPlugin from distributed.nanny import Nanny + from distributed.scheduler import T_runspec P = ParamSpec("P") T = TypeVar("T") @@ -2226,24 +2224,6 @@ def actor_attribute(self, actor=None, attribute=None) -> dict[str, Any]: except Exception as ex: return {"status": "error", "exception": to_serialize(ex)} - async def _maybe_deserialize_task( - self, ts: TaskState - ) -> tuple[Callable, tuple, dict[str, Any]]: - assert ts.run_spec is not None - start = time() - # Offload deserializing large tasks - if sizeof(ts.run_spec) > OFFLOAD_THRESHOLD: - function, args, kwargs = await offload(_deserialize, *ts.run_spec) - else: - function, args, kwargs = _deserialize(*ts.run_spec) - stop = time() - - if stop - start > 0.010: - ts.startstops.append( - {"action": "deserialize", "start": start, "stop": stop} - ) - return function, args, kwargs - @fail_hard async def execute(self, key: str, *, stimulus_id: str) -> StateMachineEvent: """Execute a task. Implements BaseWorker abstract method. @@ -2262,23 +2242,13 @@ async def execute(self, key: str, *, stimulus_id: str) -> StateMachineEvent: ts = self.state.tasks[key] run_id = ts.run_id - try: - function, args, kwargs = await self._maybe_deserialize_task(ts) - except Exception as exc: - logger.error("Could not deserialize task %s", key, exc_info=True) - return ExecuteFailureEvent.from_exception( - exc, - key=key, - run_id=run_id, - stimulus_id=f"run-spec-deserialize-failed-{time()}", - ) - try: if self.state.validate: assert not ts.waiting_for_data assert ts.state in ("executing", "cancelled", "resumed"), ts - assert ts.run_spec is not None + assert ts.run_spec is not None + function, args, kwargs = ts.run_spec args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs) assert ts.annotations is not None @@ -2947,27 +2917,14 @@ async def get_data_from_worker( rpc.reuse(worker, comm) -job_counter = [0] - - -@context_meter.meter("deserialize") -def _deserialize(function=None, args=None, kwargs=None, task=NO_VALUE): - """Deserialize task inputs and regularize to func, args, kwargs""" - # Some objects require threadlocal state during deserialization, e.g. to - # detect the current worker - if function is not None: - function = loads_function(function) - if args and isinstance(args, bytes): - args = pickle.loads(args) - if kwargs and isinstance(kwargs, bytes): - kwargs = pickle.loads(kwargs) - - if task is not NO_VALUE: - assert not function and not args and not kwargs - function = execute_task - args = (task,) +def _normalize_task(task: Any) -> T_runspec: + if istask(task): + if task[0] is apply and not any(map(_maybe_complex, task[2:])): + return task[1], task[2], task[3] if len(task) == 4 else {} + elif not any(map(_maybe_complex, task[1:])): + return task[0], task[1:], {} - return function, args or (), kwargs or {} + return execute_task, (task,), {} def execute_task(task): @@ -3008,62 +2965,6 @@ def dumps_function(func) -> bytes: return result -def dumps_task(task): - """Serialize a dask task - - Returns a dict of bytestrings that can each be loaded with ``loads`` - - Examples - -------- - Either returns a task as a function, args, kwargs dict - - >>> from operator import add - >>> dumps_task((add, 1)) # doctest: +SKIP - {'function': b'\x80\x04\x95\x00\x8c\t_operator\x94\x8c\x03add\x94\x93\x94.' - 'args': b'\x80\x04\x95\x07\x00\x00\x00K\x01K\x02\x86\x94.'} - - Or as a single task blob if it can't easily decompose the result. This - happens either if the task is highly nested, or if it isn't a task at all - - >>> dumps_task(1) # doctest: +SKIP - {'task': b'\x80\x04\x95\x03\x00\x00\x00\x00\x00\x00\x00K\x01.'} - """ - if istask(task): - if task[0] is apply and not any(map(_maybe_complex, task[2:])): - d = {"function": dumps_function(task[1]), "args": warn_dumps(task[2])} - if len(task) == 4: - d["kwargs"] = warn_dumps(task[3]) - return d - elif not any(map(_maybe_complex, task[1:])): - return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])} - return to_serialize(task) - - -_warn_dumps_warned = [False] - - -def warn_dumps(obj, dumps=pickle.dumps, limit=1e6): - """Dump an object to bytes, warn if those bytes are large""" - b = dumps(obj) - if not _warn_dumps_warned[0] and len(b) > limit: - _warn_dumps_warned[0] = True - s = str(obj) - if len(s) > 70: - s = s[:50] + " ... " + s[-15:] - warnings.warn( - "Large object of size %s detected in task graph: \n" - " %s\n" - "Consider scattering large objects ahead of time\n" - "with client.scatter to reduce scheduler burden and \n" - "keep data on workers\n\n" - " future = client.submit(func, big_data) # bad\n\n" - " big_future = client.scatter(big_data) # good\n" - " future = client.submit(func, big_future) # good" - % (format_bytes(len(b)), s) - ) - return b - - def apply_function( function, args, diff --git a/distributed/worker_state_machine.py b/distributed/worker_state_machine.py index 985db35d24..279156a6b0 100644 --- a/distributed/worker_state_machine.py +++ b/distributed/worker_state_machine.py @@ -27,16 +27,7 @@ from dataclasses import dataclass, field from functools import lru_cache, partial, singledispatchmethod, wraps from itertools import chain -from typing import ( - TYPE_CHECKING, - Any, - ClassVar, - Literal, - NamedTuple, - TypedDict, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypedDict, Union, cast from tlz import peekn @@ -49,7 +40,7 @@ from distributed.core import ErrorMessage, error_message from distributed.metrics import DelayedMetricsLedger, monotonic, time from distributed.protocol import pickle -from distributed.protocol.serialize import Serialize +from distributed.protocol.serialize import Serialize, ToPickle from distributed.sizeof import safe_sizeof as sizeof from distributed.utils import recursive_to_dict @@ -64,6 +55,7 @@ # Circular imports from distributed.diagnostics.plugin import WorkerPlugin + from distributed.scheduler import T_runspec from distributed.worker import Worker # Not to be confused with distributed.scheduler.TaskStateState @@ -112,19 +104,6 @@ RUN_ID_SENTINEL = -1 -class SerializedTask(NamedTuple): - """Info from distributed.scheduler.TaskState.run_spec - Input to distributed.worker._deserialize - - (function, args kwargs) and task are mutually exclusive - """ - - function: bytes | None = None - args: bytes | tuple | list | None = None - kwargs: bytes | dict[str, Any] | None = None - task: object = NO_VALUE - - class StartStop(TypedDict): action: Literal["compute", "transfer", "disk-read", "disk-write", "deserialize"] start: float @@ -239,11 +218,11 @@ class TaskState: prefix: str = field(init=False) #: Task run ID. run_id: int = RUN_ID_SENTINEL - #: A named tuple containing the ``function``, ``args``, ``kwargs`` and ``task`` + #: A tuple containing the ``function``, ``args``, ``kwargs`` and ``task`` #: associated with this `TaskState` instance. This defaults to ``None`` and can #: remain empty if it is a dependency that this worker will receive from another #: worker. - run_spec: SerializedTask | None = None + run_spec: T_runspec | None = None #: The data needed by this key to run dependencies: set[TaskState] = field(default_factory=set) @@ -763,10 +742,7 @@ class ComputeTaskEvent(StateMachineEvent): nbytes: dict[str, int] priority: tuple[int, ...] duration: float - run_spec: SerializedTask | None - function: bytes | None - args: bytes | tuple | list | None | None - kwargs: bytes | dict[str, Any] | None + run_spec: T_runspec | None resource_restrictions: dict[str, float] actor: bool annotations: dict @@ -778,24 +754,17 @@ def __post_init__(self) -> None: # Fixes after msgpack decode if isinstance(self.priority, list): # type: ignore[unreachable] self.priority = tuple(self.priority) # type: ignore[unreachable] - - if self.function is not None: - assert self.run_spec is None - self.run_spec = SerializedTask( - function=self.function, args=self.args, kwargs=self.kwargs - ) - elif not isinstance(self.run_spec, SerializedTask): - self.run_spec = SerializedTask(task=self.run_spec) + if isinstance(self.run_spec, ToPickle): + # FIXME Sometimes the protocol is not unpacking this + # E.g. distributed/tests/test_client.py::test_async_with + self.run_spec = self.run_spec.data # type: ignore[unreachable] def _to_dict(self, *, exclude: Container[str] = ()) -> dict: return StateMachineEvent._to_dict(self._clean(), exclude=exclude) def _clean(self) -> StateMachineEvent: out = copy(self) - out.function = None - out.kwargs = None - out.args = None - out.run_spec = SerializedTask(task=None, function=None, args=None, kwargs=None) + out.run_spec = None return out def to_loggable(self, *, handled: float) -> StateMachineEvent: @@ -804,7 +773,7 @@ def to_loggable(self, *, handled: float) -> StateMachineEvent: return out def _after_from_dict(self) -> None: - self.run_spec = SerializedTask(task=None, function=None, args=None, kwargs=None) + self.run_spec = None @staticmethod def dummy( @@ -831,9 +800,6 @@ def dummy( priority=priority, duration=duration, run_spec=None, - function=None, - args=None, - kwargs=None, resource_restrictions=resource_restrictions or {}, actor=actor, annotations=annotations or {}, From 3cf1465bd0a1144353e398424f64f65e4728dc76 Mon Sep 17 00:00:00 2001 From: fjetter Date: Tue, 8 Aug 2023 15:25:52 +0200 Subject: [PATCH 2/5] more fixes --- distributed/tests/test_scheduler.py | 50 ++++++++++--------- distributed/tests/test_worker.py | 38 +------------- distributed/tests/test_worker_metrics.py | 16 ------ .../tests/test_worker_state_machine.py | 5 +- distributed/worker_state_machine.py | 10 +++- 5 files changed, 39 insertions(+), 80 deletions(-) diff --git a/distributed/tests/test_scheduler.py b/distributed/tests/test_scheduler.py index 04cd6cc798..754eadaff5 100644 --- a/distributed/tests/test_scheduler.py +++ b/distributed/tests/test_scheduler.py @@ -345,7 +345,26 @@ async def test_decide_worker_rootish_while_last_worker_is_retiring(c, s, a): await wait(xs + ys) -@pytest.mark.slow +from distributed import WorkerPlugin + + +class CountData(WorkerPlugin): + def __init__(self, keys): + self.keys = keys + self.worker = None + self.count = 0 + + def setup(self, worker): + self.worker = worker + + def transition(self, start, finish, *args, **kwargs): + count = 0 + for k in self.worker.data: + if k in self.keys: + count += 1 + self.count = max(self.count, count) + + @gen_cluster( nthreads=[("", 2)] * 4, client=True, @@ -359,33 +378,18 @@ async def test_graph_execution_width(c, s, *workers): The number of parallel work streams match the number of threads. """ - class Refcount: - "Track how many instances of this class exist; logs the count at creation and deletion" - - count = 0 - lock = dask.utils.SerializableLock() - log = [] - - def __init__(self): - with self.lock: - type(self).count += 1 - self.log.append(self.count) - - def __del__(self): - with self.lock: - self.log.append(self.count) - type(self).count -= 1 - - roots = [delayed(Refcount)() for _ in range(32)] + roots = [delayed(inc)(ix) for ix in range(32)] passthrough1 = [delayed(slowidentity)(r, delay=0) for r in roots] passthrough2 = [delayed(slowidentity)(r, delay=0) for r in passthrough1] done = [delayed(lambda r: None)(r) for r in passthrough2] - + await c.register_worker_plugin( + CountData(keys=[f.key for f in roots]), name="count-roots" + ) fs = c.compute(done) await wait(fs) - # NOTE: the max should normally equal `total_nthreads`. But some macOS CI machines - # are slow enough that they aren't able to reach the full parallelism of 8 threads. - assert max(Refcount.log) <= s.total_nthreads + + res = await c.run(lambda dask_worker: dask_worker.plugins["count-roots"].count) + assert all(0 < count <= 2 for count in res.values()) @gen_cluster(client=True, nthreads=[("", 1)]) diff --git a/distributed/tests/test_worker.py b/distributed/tests/test_worker.py index 6200878da1..b2a3b15872 100644 --- a/distributed/tests/test_worker.py +++ b/distributed/tests/test_worker.py @@ -26,7 +26,7 @@ from tornado.ioloop import IOLoop import dask -from dask import delayed, istask +from dask import delayed from dask.system import CPU_COUNT from dask.utils import tmpfile @@ -95,7 +95,6 @@ ExecuteFailureEvent, ExecuteSuccessEvent, RemoveReplicasEvent, - StealRequestEvent, ) pytestmark = pytest.mark.ci1 @@ -2052,7 +2051,7 @@ def __setstate__(self, state): self._thread_ident = threading.get_ident() return self - monkeypatch.setattr("distributed.worker.OFFLOAD_THRESHOLD", 1) + monkeypatch.setattr("distributed.comm.utils.OFFLOAD_THRESHOLD", 1) async with Worker(s.address, executor="offload") as w: from distributed.utils import _offload_executor @@ -2788,39 +2787,6 @@ async def test_forget_dependents_after_release(c, s, a): assert fut2.key not in {d.key for d in a.state.tasks[fut.key].dependents} -@pytest.mark.filterwarnings("ignore:Sending large graph of size") -@pytest.mark.filterwarnings("ignore:Large object of size") -@gen_cluster(client=True) -async def test_steal_during_task_deserialization(c, s, a, b, monkeypatch): - stealing_ext = s.extensions["stealing"] - await stealing_ext.stop() - - in_deserialize = asyncio.Event() - wait_in_deserialize = asyncio.Event() - - async def custom_worker_offload(func, *args): - res = func(*args) - if not istask(args) and istask(res): - in_deserialize.set() - await wait_in_deserialize.wait() - return res - - monkeypatch.setattr("distributed.worker.offload", custom_worker_offload) - obj = random.randbytes(OFFLOAD_THRESHOLD + 1) - fut = c.submit(lambda _: 41, obj, workers=[a.address], allow_other_workers=True) - - await in_deserialize.wait() - ts = s.tasks[fut.key] - a.handle_stimulus(StealRequestEvent(key=fut.key, stimulus_id="test")) - stealing_ext.scheduler.send_task_to_worker(b.address, ts) - - fut2 = c.submit(inc, fut, workers=[a.address]) - fut3 = c.submit(inc, fut2, workers=[a.address]) - wait_in_deserialize.set() - assert await fut2 == 42 - await fut3 - - @gen_cluster(client=True, config=NO_AMM) async def test_acquire_replicas(c, s, a, b): fut = c.submit(inc, 1, workers=[a.address]) diff --git a/distributed/tests/test_worker_metrics.py b/distributed/tests/test_worker_metrics.py index 8d823cc98c..059eba8566 100644 --- a/distributed/tests/test_worker_metrics.py +++ b/distributed/tests/test_worker_metrics.py @@ -150,7 +150,6 @@ async def test_custom_executor(c, s, a): await c.submit(sleep, 0.1) assert list(get_digests(a, "execute")) == [ - ("execute", span_id(s), "sleep", "deserialize", "seconds"), ("execute", span_id(s), "sleep", "executor", "seconds"), ("execute", span_id(s), "sleep", "other", "seconds"), ] @@ -160,18 +159,10 @@ async def test_custom_executor(c, s, a): ) -@gen_cluster(client=True, nthreads=[("", 1)]) -async def test_run_spec_deserialization(c, s, a): - """Test that deserialization of run_spec is metered""" - await c.submit(inc, 1, key="x") - assert 0 < a.digests_total["execute", span_id(s), "x", "deserialize", "seconds"] < 1 - - @gen_cluster(client=True) async def test_offload(c, s, a, b, monkeypatch): """Test that functions wrapped by offload() are metered""" monkeypatch.setattr(distributed.comm.utils, "OFFLOAD_THRESHOLD", 1) - monkeypatch.setattr(distributed.worker, "OFFLOAD_THRESHOLD", 1) x = c.submit(inc, 1, key="x", workers=[a.address]) y = c.submit(lambda x: None, x, key="y", workers=[b.address]) @@ -180,8 +171,6 @@ async def test_offload(c, s, a, b, monkeypatch): assert list(get_digests(b, {"offload", "serialize", "deserialize"})) == [ ("gather-dep", "offload", "seconds"), ("gather-dep", "deserialize", "seconds"), - ("execute", span_id(s), "y", "offload", "seconds"), - ("execute", span_id(s), "y", "deserialize", "seconds"), ("get-data", "offload", "seconds"), ("get-data", "serialize", "seconds"), ] @@ -364,7 +353,6 @@ def f(): await wait(c.submit(f, key="x")) assert list(get_digests(a)) == [ - ("execute", span_id(s), "x", "deserialize", "seconds"), ("execute", span_id(s), "x", "I/O", "seconds"), ("execute", span_id(s), "x", "thread-cpu", "seconds"), ("execute", span_id(s), "x", "thread-noncpu", "seconds"), @@ -387,7 +375,6 @@ async def f(): await wait(c.submit(f, key="x")) assert list(get_digests(a)) == [ - ("execute", span_id(s), "x", "deserialize", "seconds"), ("execute", span_id(s), "x", "I/O", "seconds"), ("execute", span_id(s), "x", "thread-noncpu", "seconds"), ("execute", span_id(s), "x", "other", "seconds"), @@ -431,7 +418,6 @@ def f(): a_metrics = get_digests(a) assert list(s_metrics) == [ - ("execute", "x", "deserialize", "seconds"), ("execute", "x", ("foo", 1), "seconds"), ("execute", "x", None, "custom"), ("execute", "x", "thread-cpu", "seconds"), @@ -536,7 +522,6 @@ async def test_send_metrics_to_scheduler(c, s, a, b): s_metrics = get_digests(s) expect_worker = [ - ("execute", None, "x", "deserialize", "seconds"), ("execute", None, "x", "thread-cpu", "seconds"), ("execute", None, "x", "thread-noncpu", "seconds"), ("execute", None, "x", "executor", "seconds"), @@ -583,7 +568,6 @@ async def test_no_spans_extension(c, s, a): s_metrics = get_digests(s) expect_worker = [ ("execute", None, "x", "failed", "seconds"), - ("execute", None, "y", "deserialize", "seconds"), ("execute", None, "y", "thread-cpu", "seconds"), ("execute", None, "y", "thread-noncpu", "seconds"), ("execute", None, "y", "executor", "seconds"), diff --git a/distributed/tests/test_worker_state_machine.py b/distributed/tests/test_worker_state_machine.py index c26ead57e6..446ea5cee2 100644 --- a/distributed/tests/test_worker_state_machine.py +++ b/distributed/tests/test_worker_state_machine.py @@ -426,15 +426,12 @@ def test_computetask_dummy(): nbytes={}, priority=(0,), duration=1.0, - run_spec=None, + run_spec=ComputeTaskEvent.dummy_runspec(), resource_restrictions={}, actor=False, annotations={}, span_id=None, stimulus_id="s", - function=None, - args=None, - kwargs=None, run_id=0, ) diff --git a/distributed/worker_state_machine.py b/distributed/worker_state_machine.py index 279156a6b0..409133942b 100644 --- a/distributed/worker_state_machine.py +++ b/distributed/worker_state_machine.py @@ -775,6 +775,14 @@ def to_loggable(self, *, handled: float) -> StateMachineEvent: def _after_from_dict(self) -> None: self.run_spec = None + @classmethod + def _f(cls) -> None: + return + + @classmethod + def dummy_runspec(cls) -> tuple[Callable, tuple, dict]: + return (cls._f, (), {}) + @staticmethod def dummy( key: str, @@ -799,7 +807,7 @@ def dummy( nbytes=nbytes or {k: 1 for k in who_has or ()}, priority=priority, duration=duration, - run_spec=None, + run_spec=ComputeTaskEvent.dummy_runspec(), resource_restrictions=resource_restrictions or {}, actor=actor, annotations=annotations or {}, From eb922ec8d9814461288a379256351bd7401a25f3 Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 9 Aug 2023 15:10:43 +0200 Subject: [PATCH 3/5] multiple test fixes --- distributed/tests/test_cancelled_state.py | 14 +++----------- distributed/tests/test_spans.py | 3 --- distributed/tests/test_worker_state_machine.py | 3 ++- distributed/utils_test.py | 2 -- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/distributed/tests/test_cancelled_state.py b/distributed/tests/test_cancelled_state.py index 1609972fa8..8daa83bf87 100644 --- a/distributed/tests/test_cancelled_state.py +++ b/distributed/tests/test_cancelled_state.py @@ -1111,12 +1111,11 @@ def test_workerstate_resumed_waiting_to_flight(ws): assert ws.tasks["x"].state == "flight" -@pytest.mark.parametrize("critical_section", ["execute", "deserialize_task"]) @pytest.mark.parametrize("resume_inside_critical_section", [False, True]) @pytest.mark.parametrize("resumed_status", ["executing", "resumed"]) @gen_cluster(client=True, nthreads=[("", 1)]) async def test_execute_preamble_early_cancel( - c, s, b, critical_section, resume_inside_critical_section, resumed_status + c, s, b, resume_inside_critical_section, resumed_status ): """Test multiple race conditions in the preamble of Worker.execute(), which used to cause a task to remain permanently in resumed state or to crash the worker through @@ -1129,15 +1128,8 @@ async def test_execute_preamble_early_cancel( test_worker.py::test_execute_preamble_abort_retirement """ async with BlockedExecute(s.address) as a: - if critical_section == "execute": - in_ev = a.in_execute - block_ev = a.block_execute - a.block_deserialize_task.set() - else: - assert critical_section == "deserialize_task" - in_ev = a.in_deserialize_task - block_ev = a.block_deserialize_task - a.block_execute.set() + in_ev = a.in_execute + block_ev = a.block_execute async def resume(): if resumed_status == "executing": diff --git a/distributed/tests/test_spans.py b/distributed/tests/test_spans.py index 23ca4789d3..b6210a1710 100644 --- a/distributed/tests/test_spans.py +++ b/distributed/tests/test_spans.py @@ -518,14 +518,12 @@ async def test_worker_metrics(c, s, a, b): # metrics for foo include self and its child bar assert list(foo_metrics) == [ - ("execute", "x", "deserialize", "seconds"), ("execute", "x", "thread-cpu", "seconds"), ("execute", "x", "thread-noncpu", "seconds"), ("execute", "x", "executor", "seconds"), ("execute", "x", "other", "seconds"), ("execute", "x", "memory-read", "count"), ("execute", "x", "memory-read", "bytes"), - ("execute", "y", "deserialize", "seconds"), ("execute", "y", "thread-cpu", "seconds"), ("execute", "y", "thread-noncpu", "seconds"), ("execute", "y", "executor", "seconds"), @@ -536,7 +534,6 @@ async def test_worker_metrics(c, s, a, b): list(bar0_metrics) == list(bar1_metrics) == [ - ("execute", "y", "deserialize", "seconds"), ("execute", "y", "thread-cpu", "seconds"), ("execute", "y", "thread-noncpu", "seconds"), ("execute", "y", "executor", "seconds"), diff --git a/distributed/tests/test_worker_state_machine.py b/distributed/tests/test_worker_state_machine.py index 446ea5cee2..6b21f849b3 100644 --- a/distributed/tests/test_worker_state_machine.py +++ b/distributed/tests/test_worker_state_machine.py @@ -390,6 +390,7 @@ def f(arg): annotations={}, span_id=None, stimulus_id="test", + run_id=5, ) assert ev.run_spec is not None ev2 = ev.to_loggable(handled=11.22) @@ -402,7 +403,7 @@ def f(arg): "who_has": {"y": ["w1"]}, "nbytes": {"y": 123}, "priority": [0], - "run_spec": [None, None, None], + "run_spec": None, "duration": 123.45, "resource_restrictions": {}, "actor": False, diff --git a/distributed/utils_test.py b/distributed/utils_test.py index e7bdb70b0b..2d32ae0751 100644 --- a/distributed/utils_test.py +++ b/distributed/utils_test.py @@ -2287,8 +2287,6 @@ def f(in_task, block_task): def __init__(self, *args, **kwargs): self.in_execute = asyncio.Event() self.block_execute = asyncio.Event() - self.in_deserialize_task = asyncio.Event() - self.block_deserialize_task = asyncio.Event() self.in_execute_exit = asyncio.Event() self.block_execute_exit = asyncio.Event() From 73e75e578cd43ab5533a53e08ff955f5bfb0865a Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 9 Aug 2023 16:30:00 +0200 Subject: [PATCH 4/5] leftover --- distributed/tests/test_worker.py | 1 - distributed/utils_test.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/distributed/tests/test_worker.py b/distributed/tests/test_worker.py index b2a3b15872..4422e2dd51 100644 --- a/distributed/tests/test_worker.py +++ b/distributed/tests/test_worker.py @@ -3617,7 +3617,6 @@ async def test_execute_preamble_abort_retirement(c, s): """ async with BlockedExecute(s.address) as a: await c.wait_for_workers(1) - a.block_deserialize_task.set() # Uninteresting in this test x = await c.scatter({"x": 1}, workers=[a.address]) y = c.submit(inc, 1, key="y", workers=[a.address]) diff --git a/distributed/utils_test.py b/distributed/utils_test.py index 2d32ae0751..8fc4ec4d38 100644 --- a/distributed/utils_test.py +++ b/distributed/utils_test.py @@ -2253,10 +2253,6 @@ class BlockedExecute(Worker): method and then does not proceed, thus leaving the task in executing state indefinitely, until the test sets `block_execute`. - After that, the worker sets `in_deserialize_task` to simulate the moment when a - large run_spec is being deserialized in a separate thread. The worker will block - again until the test sets `block_deserialize_task`. - Finally, the worker sets `in_execute_exit` when execute() terminates, but before the worker state has processed its exit callback. The worker will block one last time until the test sets `block_execute_exit`. From 43b261f3b0111ab9710894308fabf32765870f9f Mon Sep 17 00:00:00 2001 From: Florian Jetter Date: Thu, 10 Aug 2023 17:43:48 +0200 Subject: [PATCH 5/5] Update distributed/worker_state_machine.py Co-authored-by: Hendrik Makait --- distributed/worker_state_machine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/worker_state_machine.py b/distributed/worker_state_machine.py index 409133942b..7d22d7075d 100644 --- a/distributed/worker_state_machine.py +++ b/distributed/worker_state_machine.py @@ -777,7 +777,7 @@ def _after_from_dict(self) -> None: @classmethod def _f(cls) -> None: - return + return # pragma: nocover @classmethod def dummy_runspec(cls) -> tuple[Callable, tuple, dict]: