diff --git a/src/dstack/_internal/cli/utils/fleet.py b/src/dstack/_internal/cli/utils/fleet.py index e558dd9b7..751b89919 100644 --- a/src/dstack/_internal/cli/utils/fleet.py +++ b/src/dstack/_internal/cli/utils/fleet.py @@ -24,6 +24,9 @@ def get_fleets_table(fleets: List[Fleet], verbose: bool = False) -> Table: table.add_column("STATUS") table.add_column("CREATED") + if verbose: + table.add_column("ERROR") + for fleet in fleets: for i, instance in enumerate(fleet.instances): resources = "" @@ -55,6 +58,13 @@ def get_fleets_table(fleets: List[Fleet], verbose: bool = False) -> Table: status, pretty_date(instance.created), ] + + if verbose: + error = "" + if instance.status == InstanceStatus.TERMINATED and instance.termination_reason: + error = f"{instance.termination_reason}" + row.append(error) + table.add_row(*row) if len(fleet.instances) == 0 and fleet.status != FleetStatus.TERMINATING: diff --git a/src/dstack/_internal/core/models/pools.py b/src/dstack/_internal/core/models/pools.py index 69921fc8a..0c55464bd 100644 --- a/src/dstack/_internal/core/models/pools.py +++ b/src/dstack/_internal/core/models/pools.py @@ -27,6 +27,7 @@ class Instance(CoreModel): hostname: Optional[str] = None status: InstanceStatus unreachable: bool = False + termination_reason: Optional[str] = None created: datetime.datetime region: Optional[str] = None price: Optional[float] = None diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index c91cce0f3..9c009d87b 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -187,7 +187,7 @@ async def _terminate_idle_instance(instance: InstanceModel): async def _add_remote(instance: InstanceModel) -> None: - logger.info("Adding remote instance %s...", instance.name) + logger.info("Adding ssh instance %s...", instance.name) if instance.status == 
InstanceStatus.PENDING: instance.status = InstanceStatus.PROVISIONING @@ -196,11 +196,9 @@ async def _add_remote(instance: InstanceModel) -> None: ) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS) if retry_duration_deadline < get_current_datetime(): instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = "The proivisioning timeout expired" + instance.termination_reason = "Provisioning timeout expired" logger.warning( - "Failed to start the instance in %s seconds. Terminate instance %s", + "Failed to start instance in %s seconds. Terminating instance %s", PROVISIONING_TIMEOUT_SECONDS, instance.name, extra={ @@ -221,11 +219,9 @@ async def _add_remote(instance: InstanceModel) -> None: ] except (ValueError, PasswordRequiredException): instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() instance.termination_reason = "Unsupported private SSH key type" logger.warning( - "Failed to start instance %s: unsupported private SSH key type", + "Failed to add instance %s: unsupported private SSH key type", instance.name, extra={ "instance_name": instance.name, @@ -253,7 +249,7 @@ async def _add_remote(instance: InstanceModel) -> None: ) except ProvisioningError as e: logger.warning( - "Provisioning the instance '%s' could not be completed because of the error: %s", + "Provisioning instance %s could not be completed because of the error: %s", instance.name, e, ) @@ -275,13 +271,9 @@ async def _add_remote(instance: InstanceModel) -> None: ) if instance_network is not None and internal_ip is None: instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() - instance.termination_reason = ( - "Unable to locate the internal ip-address for the given network" - ) + instance.termination_reason = "Failed to locate internal IP address on the given network" logger.warning( - "Failed to 
configure internal ip-address on instance %s. Terminate it", + "Failed to add instance %s: failed to locate internal IP address on the given network", instance.name, extra={ "instance_name": instance.name, @@ -391,8 +383,6 @@ async def _create_instance(instance: InstanceModel) -> None: or instance.instance_configuration is None ): instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() instance.termination_reason = "Empty profile, requirements or instance_configuration" instance.last_retry_at = get_current_datetime() logger.warning( @@ -417,8 +407,6 @@ async def _create_instance(instance: InstanceModel) -> None: ) except ValidationError as e: instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() instance.termination_reason = ( f"Error to parse profile, requirements or instance_configuration: {e}" ) @@ -440,11 +428,9 @@ async def _create_instance(instance: InstanceModel) -> None: retry_duration_deadline = _get_retry_duration_deadline(instance, retry) if get_current_datetime() > retry_duration_deadline: instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() instance.termination_reason = "Retry duration expired" logger.warning( - "Retry duration expired. Terminate instance %s", + "Retry duration expired. Terminating instance %s", instance.name, extra={ "instance_name": instance.name, @@ -523,8 +509,6 @@ async def _create_instance(instance: InstanceModel) -> None: if not should_retry: instance.status = InstanceStatus.TERMINATED - instance.deleted = True - instance.deleted_at = get_current_datetime() instance.termination_reason = "No offers found" logger.info( "No offers found. 
Terminated instance %s", @@ -760,6 +744,7 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool: if ( instance.id == instance.fleet.instances[0].id or instance.fleet.instances[0].job_provisioning_data is not None + or instance.fleet.instances[0].status == InstanceStatus.TERMINATED ): return False fleet = fleet_model_to_fleet(instance.fleet) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 67878d520..435e5f70a 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -418,4 +418,7 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[ for instance in fleet_model.instances: if instance_nums is not None and instance.instance_num not in instance_nums: continue - instance.status = InstanceStatus.TERMINATING + if instance.status == InstanceStatus.TERMINATED: + instance.deleted = True + else: + instance.status = InstanceStatus.TERMINATING diff --git a/src/dstack/_internal/server/services/pools.py b/src/dstack/_internal/server/services/pools.py index 0605f5d47..4a57cd177 100644 --- a/src/dstack/_internal/server/services/pools.py +++ b/src/dstack/_internal/server/services/pools.py @@ -230,6 +230,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance: instance_num=instance_model.instance_num, status=instance_model.status, unreachable=instance_model.unreachable, + termination_reason=instance_model.termination_reason, created=instance_model.created_at.replace(tzinfo=timezone.utc), ) diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 824098b23..931aff6f1 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -214,6 +214,7 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async "hostname": None, "status": "pending", 
"unreachable": False, + "termination_reason": None, "created": "2023-01-02T03:04:00+00:00", "pool_name": None, "backend": None, @@ -335,6 +336,7 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A "hostname": "1.1.1.1", "status": "pending", "unreachable": False, + "termination_reason": None, "created": "2023-01-02T03:04:00+00:00", "region": "remote", "price": 0.0, diff --git a/src/tests/_internal/server/routers/test_pools.py b/src/tests/_internal/server/routers/test_pools.py index 0d7a6c256..cb697c4cb 100644 --- a/src/tests/_internal/server/routers/test_pools.py +++ b/src/tests/_internal/server/routers/test_pools.py @@ -323,6 +323,7 @@ async def test_show_pool(self, test_db, session: AsyncSession, client: AsyncClie "hostname": "running_instance.ip", "status": "idle", "unreachable": False, + "termination_reason": None, "created": "2023-01-02T03:04:00+00:00", "pool_name": None, "region": "en", @@ -489,6 +490,7 @@ async def test_remove_instance(self, test_db, session: AsyncSession, client: Asy "hostname": "running_instance.ip", "status": "terminating", "unreachable": False, + "termination_reason": None, "created": "2023-01-02T03:04:00+00:00", "pool_name": None, "region": "en", diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index f87eb033e..3574a0df5 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -1021,6 +1021,7 @@ async def test_creates_instance(self, test_db, session: AsyncSession, client: As "hostname": None, "status": "pending", "unreachable": False, + "termination_reason": None, "created": result["created"], "pool_name": None, "region": None,